OSDN Git Service

2007-11-22 Johannes Singler <singler@ira.uka.de>
authorsingler <singler@138bc75d-0d04-0410-961f-82ee72b054a4>
Thu, 22 Nov 2007 10:13:08 +0000 (10:13 +0000)
committersingler <singler@138bc75d-0d04-0410-961f-82ee72b054a4>
Thu, 22 Nov 2007 10:13:08 +0000 (10:13 +0000)
        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@130347 138bc75d-0d04-0410-961f-82ee72b054a4

21 files changed:
libstdc++-v3/ChangeLog
libstdc++-v3/include/parallel/balanced_quicksort.h
libstdc++-v3/include/parallel/base.h
libstdc++-v3/include/parallel/compiletime_settings.h
libstdc++-v3/include/parallel/equally_split.h
libstdc++-v3/include/parallel/features.h
libstdc++-v3/include/parallel/find.h
libstdc++-v3/include/parallel/losertree.h
libstdc++-v3/include/parallel/multiway_merge.h
libstdc++-v3/include/parallel/multiway_mergesort.h
libstdc++-v3/include/parallel/omp_loop.h
libstdc++-v3/include/parallel/omp_loop_static.h
libstdc++-v3/include/parallel/par_loop.h
libstdc++-v3/include/parallel/partial_sum.h
libstdc++-v3/include/parallel/partition.h
libstdc++-v3/include/parallel/quicksort.h
libstdc++-v3/include/parallel/random_shuffle.h
libstdc++-v3/include/parallel/search.h
libstdc++-v3/include/parallel/set_operations.h
libstdc++-v3/include/parallel/unique_copy.h
libstdc++-v3/include/parallel/workstealing.h

index dbb620c..7415d55 100644 (file)
@@ -1,3 +1,27 @@
+2007-11-22  Johannes Singler  <singler@ira.uka.de>
+
+       PR libstdc++/33893
+        * include/parallel/multiway_merge.h: made omp_dynamic-safe
+        * include/parallel/workstealing.h: made omp_dynamic-safe
+        * include/parallel/base.h: infrastructure, cleanup
+        * include/parallel/par_loop.h: made omp_dynamic-safe
+        * include/parallel/features.h: activate loser tree variant
+        * include/parallel/quicksort.h: made omp_dynamic-safe
+        * include/parallel/compiletime_settings.h: settings overridable
+        * include/parallel/equally_split.h: made omp_dynamic-safe
+        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
+        * include/parallel/random_shuffle.h: made omp_dynamic-safe
+        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
+        * include/parallel/set_operations.h: made omp_dynamic-safe
+        * include/parallel/unique_copy.h: made omp_dynamic-safe
+        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
+        * include/parallel/search.h: made omp_dynamic-safe
+        * include/parallel/partition.h: made omp_dynamic-safe
+        * include/parallel/partial_sum.h: made omp_dynamic-safe
+        * include/parallel/find.h: made omp_dynamic-safe
+        * include/parallel/omp_loop.h: made omp_dynamic-safe
+        * include/parallel/losertree.h: avoid default constructor
+
 2007-11-21  Jonathan Wakely  <jwakely.gcc@gmail.com>
 
        * docs/html/17_intro/C++STYLE: Fix typos.
index c282770..0d845ce 100644 (file)
 
 namespace __gnu_parallel
 {
-  /** @brief Information local to one thread in the parallel quicksort run. */
-  template<typename RandomAccessIterator>
+/** @brief Information local to one thread in the parallel quicksort run. */
+template<typename RandomAccessIterator>
   struct QSBThreadLocal
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::difference_type difference_type;
 
     /** @brief Continuous part of the sequence, described by an
-       iterator pair. */
+    iterator pair. */
     typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece;
 
     /** @brief Initial piece to work on. */
@@ -94,29 +94,17 @@ namespace __gnu_parallel
     QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
   };
 
-  /** @brief Initialize the thread local storage.
-   *  @param tls Array of thread-local storages.
-   *  @param queue_size Size of the work-stealing queue. */
-  template<typename RandomAccessIterator>
-  inline void
-  qsb_initialize(QSBThreadLocal<RandomAccessIterator>** tls, int queue_size)
-  {
-    int iam = omp_get_thread_num();
-    tls[iam] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
-  }
-
-
-  /** @brief Balanced quicksort divide step.
-   *  @param begin Begin iterator of subsequence.
-   *  @param end End iterator of subsequence.
-   *  @param comp Comparator.
-   *  @param num_threads Number of threads that are allowed to work on
-   *  this part.
-   *  @pre @c (end-begin)>=1 */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief Balanced quicksort divide step.
+  *  @param begin Begin iterator of subsequence.
+  *  @param end End iterator of subsequence.
+  *  @param comp Comparator.
+  *  @param num_threads Number of threads that are allowed to work on
+  *  this part.
+  *  @pre @c (end-begin)>=1 */
+template<typename RandomAccessIterator, typename Comparator>
   inline typename std::iterator_traits<RandomAccessIterator>::difference_type
   qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
-            Comparator comp, int num_threads)
+             Comparator comp, thread_index_t num_threads)
   {
     _GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
 
@@ -124,18 +112,20 @@ namespace __gnu_parallel
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
-    RandomAccessIterator pivot_pos = median_of_three_iterators(begin, begin + (end - begin) / 2, end  - 1, comp);
+    RandomAccessIterator pivot_pos = median_of_three_iterators(
+        begin, begin + (end - begin) / 2, end  - 1, comp);
 
 #if defined(_GLIBCXX_ASSERTIONS)
     // Must be in between somewhere.
     difference_type n = end - begin;
 
-    _GLIBCXX_PARALLEL_ASSERT((!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
-          || (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
-          || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
-          || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
-          || (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos))
-          || (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos)));
+    _GLIBCXX_PARALLEL_ASSERT(
+           (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
+        || (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
+        || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
+        || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
+        || (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos))
+        || (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos)));
 #endif
 
     // Swap pivot value to end.
@@ -143,10 +133,12 @@ namespace __gnu_parallel
       std::swap(*pivot_pos, *(end - 1));
     pivot_pos = end - 1;
 
-    __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
+    __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
+        pred(comp, *pivot_pos);
 
     // Divide, returning end - begin - 1 in the worst case.
-    difference_type split_pos = parallel_partition(begin, end - 1, pred, num_threads);
+    difference_type split_pos = parallel_partition(
+        begin, end - 1, pred, num_threads);
 
     // Swap back pivot to middle.
     std::swap(*(begin + split_pos), *pivot_pos);
@@ -163,18 +155,21 @@ namespace __gnu_parallel
     return split_pos;
   }
 
-  /** @brief Quicksort conquer step.
-   *  @param tls Array of thread-local storages.
-   *  @param begin Begin iterator of subsequence.
-   *  @param end End iterator of subsequence.
-   *  @param comp Comparator.
-   *  @param iam Number of the thread processing this function.
-   *  @param num_threads Number of threads that are allowed to work on this part. */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief Quicksort conquer step.
+  *  @param tls Array of thread-local storages.
+  *  @param begin Begin iterator of subsequence.
+  *  @param end End iterator of subsequence.
+  *  @param comp Comparator.
+  *  @param iam Number of the thread processing this function.
+  *  @param num_threads
+  *          Number of threads that are allowed to work on this part. */
+template<typename RandomAccessIterator, typename Comparator>
   inline void
   qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls,
-             RandomAccessIterator begin, RandomAccessIterator end,
-             Comparator comp, thread_index_t iam, thread_index_t num_threads)
+              RandomAccessIterator begin, RandomAccessIterator end,
+              Comparator comp,
+              thread_index_t iam, thread_index_t num_threads,
+              bool parent_wait)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -182,14 +177,14 @@ namespace __gnu_parallel
 
     difference_type n = end - begin;
 
-    if (num_threads <= 1 || n < 2)
+    if (num_threads <= 1 || n <= 1)
       {
-       tls[iam]->initial.first  = begin;
-       tls[iam]->initial.second = end;
+        tls[iam]->initial.first  = begin;
+        tls[iam]->initial.second = end;
 
-       qsb_local_sort_with_helping(tls, comp, iam);
+        qsb_local_sort_with_helping(tls, comp, iam, parent_wait);
 
-       return;
+        return;
       }
 
     // Divide step.
@@ -199,33 +194,55 @@ namespace __gnu_parallel
     _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
 #endif
 
-    thread_index_t num_threads_leftside = std::max<thread_index_t>(1, std::min<thread_index_t>(num_threads - 1, split_pos * num_threads / n));
+    thread_index_t num_threads_leftside =
+        std::max<thread_index_t>(1, std::min<thread_index_t>(
+                          num_threads - 1, split_pos * num_threads / n));
 
-#pragma omp atomic
+#   pragma omp atomic
     *tls[iam]->elements_leftover -= (difference_type)1;
 
     // Conquer step.
-#pragma omp parallel sections num_threads(2)
+#   pragma omp parallel num_threads(2)
     {
-#pragma omp section
-      qsb_conquer(tls, begin, begin + split_pos, comp, iam, num_threads_leftside);
-      // The pivot_pos is left in place, to ensure termination.
-#pragma omp section
-      qsb_conquer(tls, begin + split_pos + 1, end, comp,
-                 iam + num_threads_leftside, num_threads - num_threads_leftside);
+      bool wait;
+      if(omp_get_num_threads() < 2)
+        wait = false;
+      else
+        wait = parent_wait;
+
+#     pragma omp sections
+        {
+#         pragma omp section
+            {
+              qsb_conquer(tls, begin, begin + split_pos, comp,
+                          iam,
+                          num_threads_leftside,
+                          wait);
+              wait = parent_wait;
+            }
+          // The pivot_pos is left in place, to ensure termination.
+#         pragma omp section
+            {
+              qsb_conquer(tls, begin + split_pos + 1, end, comp,
+                          iam + num_threads_leftside,
+                          num_threads - num_threads_leftside,
+                          wait);
+              wait = parent_wait;
+            }
+        }
     }
   }
 
-  /** 
-   *  @brief Quicksort step doing load-balanced local sort.
-   *  @param tls Array of thread-local storages.
-   *  @param comp Comparator.
-   *  @param iam Number of the thread processing this function. 
-   */
-  template<typename RandomAccessIterator, typename Comparator>
+/**
+  *  @brief Quicksort step doing load-balanced local sort.
+  *  @param tls Array of thread-local storages.
+  *  @param comp Comparator.
+  *  @param iam Number of the thread processing this function.
+  */
+template<typename RandomAccessIterator, typename Comparator>
   inline void
   qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls,
-                             Comparator& comp, int iam)
+                              Comparator& comp, int iam, bool wait)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -251,151 +268,162 @@ namespace __gnu_parallel
 
     for (;;)
       {
-       // Invariant: current must be a valid (maybe empty) range.
-       RandomAccessIterator begin = current.first, end = current.second;
-       difference_type n = end - begin;
+        // Invariant: current must be a valid (maybe empty) range.
+        RandomAccessIterator begin = current.first, end = current.second;
+        difference_type n = end - begin;
 
-       if (n > base_case_n)
-         {
-           // Divide.
-           RandomAccessIterator pivot_pos = begin +  rng(n);
+        if (n > base_case_n)
+          {
+            // Divide.
+            RandomAccessIterator pivot_pos = begin +  rng(n);
 
-           // Swap pivot_pos value to end.
-           if (pivot_pos != (end - 1))
-             std::swap(*pivot_pos, *(end - 1));
-           pivot_pos = end - 1;
+            // Swap pivot_pos value to end.
+            if (pivot_pos != (end - 1))
+              std::swap(*pivot_pos, *(end - 1));
+            pivot_pos = end - 1;
 
-           __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
+            __gnu_parallel::binder2nd
+                <Comparator, value_type, value_type, bool>
+                pred(comp, *pivot_pos);
 
-           // Divide, leave pivot unchanged in last place.
-           RandomAccessIterator split_pos1, split_pos2;
-           split_pos1 = __gnu_sequential::partition(begin, end - 1, pred);
+            // Divide, leave pivot unchanged in last place.
+            RandomAccessIterator split_pos1, split_pos2;
+            split_pos1 = __gnu_sequential::partition(begin, end - 1, pred);
 
-           // Left side: < pivot_pos; right side: >= pivot_pos.
+            // Left side: < pivot_pos; right side: >= pivot_pos.
 #if _GLIBCXX_ASSERTIONS
-           _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end);
+            _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end);
 #endif
-           // Swap pivot back to middle.
-           if (split_pos1 != pivot_pos)
-             std::swap(*split_pos1, *pivot_pos);
-           pivot_pos = split_pos1;
-
-           // In case all elements are equal, split_pos1 == 0.
-           if ((split_pos1 + 1 - begin) < (n >> 7)
-               || (end - split_pos1) < (n >> 7))
-             {
-               // Very unequal split, one part smaller than one 128th
-               // elements not strictly larger than the pivot.
-               __gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
-
-               // Find other end of pivot-equal range.
-               split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
-             }
-           else
-             {
-               // Only skip the pivot.
-               split_pos2 = split_pos1 + 1;
-             }
-
-           // Elements equal to pivot are done.
-           elements_done += (split_pos2 - split_pos1);
+            // Swap pivot back to middle.
+            if (split_pos1 != pivot_pos)
+              std::swap(*split_pos1, *pivot_pos);
+            pivot_pos = split_pos1;
+
+            // In case all elements are equal, split_pos1 == 0.
+            if ((split_pos1 + 1 - begin) < (n >> 7)
+            || (end - split_pos1) < (n >> 7))
+              {
+                // Very unequal split, one part smaller than one 128th
+                // elements not strictly larger than the pivot.
+                __gnu_parallel::unary_negate<__gnu_parallel::binder1st
+                    <Comparator, value_type, value_type, bool>, value_type>
+                    pred(__gnu_parallel::binder1st
+                        <Comparator, value_type, value_type, bool>(
+                        comp, *pivot_pos));
+
+                // Find other end of pivot-equal range.
+                split_pos2 = __gnu_sequential::partition(
+                    split_pos1 + 1, end, pred);
+              }
+            else
+              // Only skip the pivot.
+              split_pos2 = split_pos1 + 1;
+
+            // Elements equal to pivot are done.
+            elements_done += (split_pos2 - split_pos1);
 #if _GLIBCXX_ASSERTIONS
-           total_elements_done += (split_pos2 - split_pos1);
+            total_elements_done += (split_pos2 - split_pos1);
 #endif
-           // Always push larger part onto stack.
-           if (((split_pos1 + 1) - begin) < (end - (split_pos2)))
-             {
-               // Right side larger.
-               if ((split_pos2) != end)
-                 tl.leftover_parts.push_front(std::make_pair(split_pos2, end));
-
-               //current.first = begin;        //already set anyway
-               current.second = split_pos1;
-               continue;
-             }
-           else
-             {
-               // Left side larger.
-               if (begin != split_pos1)
-                 tl.leftover_parts.push_front(std::make_pair(begin, split_pos1));
-
-               current.first = split_pos2;
-               //current.second = end; //already set anyway
-               continue;
-             }
-         }
-       else
-         {
-           __gnu_sequential::sort(begin, end, comp);
-           elements_done += n;
+            // Always push larger part onto stack.
+            if (((split_pos1 + 1) - begin) < (end - (split_pos2)))
+              {
+                // Right side larger.
+                if ((split_pos2) != end)
+                  tl.leftover_parts.push_front(std::make_pair(split_pos2, end));
+
+                //current.first = begin;       //already set anyway
+                current.second = split_pos1;
+                continue;
+              }
+            else
+              {
+                // Left side larger.
+                if (begin != split_pos1)
+                  tl.leftover_parts.push_front(
+                      std::make_pair(begin, split_pos1));
+
+                current.first = split_pos2;
+                //current.second = end;        //already set anyway
+                continue;
+              }
+          }
+        else
+          {
+            __gnu_sequential::sort(begin, end, comp);
+            elements_done += n;
 #if _GLIBCXX_ASSERTIONS
-           total_elements_done += n;
+            total_elements_done += n;
 #endif
 
-           // Prefer own stack, small pieces.
-           if (tl.leftover_parts.pop_front(current))
-             continue;
+            // Prefer own stack, small pieces.
+            if (tl.leftover_parts.pop_front(current))
+              continue;
 
-#pragma omp atomic
-           *tl.elements_leftover -= elements_done;
-           elements_done = 0;
+#           pragma omp atomic
+            *tl.elements_leftover -= elements_done;
+
+            elements_done = 0;
 
 #if _GLIBCXX_ASSERTIONS
-           double search_start = omp_get_wtime();
+            double search_start = omp_get_wtime();
 #endif
 
-           // Look for new work.
-           bool success = false;
-           while (*tl.elements_leftover > 0 && !success
+            // Look for new work.
+            bool successfully_stolen = false;
+            while (wait && *tl.elements_leftover > 0 && !successfully_stolen
 #if _GLIBCXX_ASSERTIONS
-                  // Possible dead-lock.
-                  && (omp_get_wtime() < (search_start + 1.0))
+              // Possible dead-lock.
+              && (omp_get_wtime() < (search_start + 1.0))
 #endif
-                  )
-             {
-               thread_index_t victim;
-               victim = rng(num_threads);
-
-               // Large pieces.
-               success = (victim != iam) && tls[victim]->leftover_parts.pop_back(current);
-               if (!success)
-                 yield();
+              )
+              {
+                thread_index_t victim;
+                victim = rng(num_threads);
+
+                // Large pieces.
+                successfully_stolen = (victim != iam)
+                    && tls[victim]->leftover_parts.pop_back(current);
+                if (!successfully_stolen)
+                  yield();
 #if !defined(__ICC) && !defined(__ECC)
-#pragma omp flush
+#               pragma omp flush
 #endif
-             }
+              }
 
 #if _GLIBCXX_ASSERTIONS
-           if (omp_get_wtime() >= (search_start + 1.0))
-             {
-               sleep(1);
-               _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() < (search_start + 1.0));
-             }
+            if (omp_get_wtime() >= (search_start + 1.0))
+              {
+                sleep(1);
+                _GLIBCXX_PARALLEL_ASSERT(
+                    omp_get_wtime() < (search_start + 1.0));
+              }
 #endif
-           if (!success)
-             {
+            if (!successfully_stolen)
+              {
 #if _GLIBCXX_ASSERTIONS
-               _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
+                _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
 #endif
-               return;
-             }
-         }
+                return;
+              }
+          }
       }
   }
 
-  /** @brief Top-level quicksort routine.
-   *  @param begin Begin iterator of sequence.
-   *  @param end End iterator of sequence.
-   *  @param comp Comparator.
-   *  @param n Length of the sequence to sort.
-   *  @param num_threads Number of threads that are allowed to work on
-   *  this part.
-   */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief Top-level quicksort routine.
+  *  @param begin Begin iterator of sequence.
+  *  @param end End iterator of sequence.
+  *  @param comp Comparator.
+  *  @param n Length of the sequence to sort.
+  *  @param num_threads Number of threads that are allowed to work on
+  *  this part.
+  */
+template<typename RandomAccessIterator, typename Comparator>
   inline void
   parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
-                   Comparator comp,
-                   typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
+                    Comparator comp,
+                    typename std::iterator_traits<RandomAccessIterator>
+                        ::difference_type n,
+                    thread_index_t num_threads)
   {
     _GLIBCXX_CALL(end - begin)
 
@@ -413,11 +441,11 @@ namespace __gnu_parallel
     if (num_threads > n)
       num_threads = static_cast<thread_index_t>(n);
 
+    // Initialize thread local storage
     tls_type** tls = new tls_type*[num_threads];
-
-#pragma omp parallel num_threads(num_threads)
-    // Initialize variables per processor.
-    qsb_initialize(tls, num_threads * (thread_index_t)(log2(n) + 1));
+    difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1);
+    for (thread_index_t t = 0; t < num_threads; ++t)
+      tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
 
     // There can never be more than ceil(log2(n)) ranges on the stack, because
     // 1. Only one processor pushes onto the stack
@@ -426,22 +454,16 @@ namespace __gnu_parallel
     volatile difference_type elements_leftover = n;
     for (int i = 0; i < num_threads; i++)
       {
-       tls[i]->elements_leftover = &elements_leftover;
-       tls[i]->num_threads = num_threads;
-       tls[i]->global = std::make_pair(begin, end);
+        tls[i]->elements_leftover = &elements_leftover;
+        tls[i]->num_threads = num_threads;
+        tls[i]->global = std::make_pair(begin, end);
 
-       // Just in case nothing is left to assign.
-       tls[i]->initial = std::make_pair(end, end);
+        // Just in case nothing is left to assign.
+        tls[i]->initial = std::make_pair(end, end);
       }
 
-    // Initial splitting, recursively.
-    int old_nested = omp_get_nested();
-    omp_set_nested(true);
-
     // Main recursion call.
-    qsb_conquer(tls, begin, begin + n, comp, 0, num_threads);
-
-    omp_set_nested(old_nested);
+    qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true);
 
 #if _GLIBCXX_ASSERTIONS
     // All stack must be empty.
index 0a86d83..5a756cd 100644 (file)
@@ -49,54 +49,70 @@ namespace __gnu_parallel
   // XXX remove std::duplicates from here if possible,
   // XXX but keep minimal dependencies.
 
-  /** @brief Calculates the rounded-down logarithm of @c n for base 2.
-   *  @param n Argument.
-   *  @return Returns 0 for argument 0.
-   */
-  template<typename Size> 
-    inline Size 
-    log2(Size n)
+/** @brief Calculates the rounded-down logarithm of @c n for base 2.
+  *  @param n Argument.
+  *  @return Returns 0 for argument 0.
+  */
+template<typename Size>
+  inline Size
+  log2(Size n)
     {
       Size k;
       for (k = 0; n != 1; n >>= 1)
-       ++k;
+        ++k;
       return k;
     }
 
-  /** @brief Encode two integers into one __gnu_parallel::lcas_t.
-   *  @param a First integer, to be encoded in the most-significant @c
-   *  lcas_t_bits/2 bits.
-   *  @param b Second integer, to be encoded in the least-significant
-   *  @c lcas_t_bits/2 bits.
-   *  @return __gnu_parallel::lcas_t value encoding @c a and @c b.
-   *  @see decode2 
-   */
-  inline lcas_t
-  encode2(int a, int b)        //must all be non-negative, actually
+/** @brief Encode two integers into one __gnu_parallel::lcas_t.
+  *  @param a First integer, to be encoded in the most-significant @c
+  *  lcas_t_bits/2 bits.
+  *  @param b Second integer, to be encoded in the least-significant
+  *  @c lcas_t_bits/2 bits.
+  *  @return __gnu_parallel::lcas_t value encoding @c a and @c b.
+  *  @see decode2
+  */
+inline lcas_t
+encode2(int a, int b)  //must all be non-negative, actually
+{
+  return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
+}
+
+/** @brief Decode two integers from one __gnu_parallel::lcas_t.
+  *  @param x __gnu_parallel::lcas_t to decode integers from.
+  *  @param a First integer, to be decoded from the most-significant
+  *  @c lcas_t_bits/2 bits of @c x.
+  *  @param b Second integer, to be encoded in the least-significant
+  *  @c lcas_t_bits/2 bits of @c x.
+  *  @see encode2
+  */
+inline void
+decode2(lcas_t x, int& a, int& b)
+{
+  a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
+  b = (int)((x >>               0 ) & lcas_t_mask);
+}
+
+/** @brief Equivalent to std::min. */
+template<typename T>
+  const T&
+  min(const T& a, const T& b)
   {
-    return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
-  }
+    return (a < b) ? a : b;
+  };
 
-  /** @brief Decode two integers from one __gnu_parallel::lcas_t.
-   *  @param x __gnu_parallel::lcas_t to decode integers from.
-   *  @param a First integer, to be decoded from the most-significant
-   *  @c lcas_t_bits/2 bits of @c x.
-   *  @param b Second integer, to be encoded in the least-significant
-   *  @c lcas_t_bits/2 bits of @c x.
-   *  @see encode2
-   */
-  inline void
-  decode2(lcas_t x, int& a, int& b)
+/** @brief Equivalent to std::max. */
+template<typename T>
+  const T&
+  max(const T& a, const T& b)
   {
-    a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
-    b = (int)((x >>               0 ) & lcas_t_mask);
-  }
+    return (a > b) ? a : b;
+  };
 
-  /** @brief Constructs predicate for equality from strict weak
-   *  ordering predicate
-   */
-  // XXX comparator at the end, as per others
-  template<typename Comparator, typename T1, typename T2>
+/** @brief Constructs predicate for equality from strict weak
+  *  ordering predicate
+  */
+// XXX comparator at the end, as per others
+template<typename Comparator, typename T1, typename T2>
   class equal_from_less : public std::binary_function<T1, T2, bool>
   {
   private:
@@ -112,162 +128,176 @@ namespace __gnu_parallel
   };
 
 
-  /** @brief Similar to std::binder1st, but giving the argument types explicitly. */
-  template<typename _Predicate, typename argument_type>
-    class unary_negate
-    : public std::unary_function<argument_type, bool>
-    {
-    protected:
-      _Predicate _M_pred;
-
-    public:
-      explicit
-      unary_negate(const _Predicate& __x) : _M_pred(__x) { }
-
-      bool
-      operator()(const argument_type& __x)
-      { return !_M_pred(__x); }
-    };
-
-  /** @brief Similar to std::binder1st, but giving the argument types explicitly. */
-  template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
-    class binder1st
-    : public std::unary_function<second_argument_type, result_type>
-    {
-    protected:
-      _Operation op;
-      first_argument_type value;
-
-    public:
-      binder1st(const _Operation& __x,
-               const first_argument_type& __y)
-      : op(__x), value(__y) { }
-
-      result_type
-      operator()(const second_argument_type& __x)
-      { return op(value, __x); }
-
-      // _GLIBCXX_RESOLVE_LIB_DEFECTS
-      // 109.  Missing binders for non-const sequence elements
-      result_type
-      operator()(second_argument_type& __x) const
-      { return op(value, __x); }
-    };
-
-  /** 
-   *  @brief Similar to std::binder2nd, but giving the argument types
-   *  explicitly. 
-   */
-  template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
-    class binder2nd
-    : public std::unary_function<first_argument_type, result_type>
-    {
-    protected:
-      _Operation op;
-      second_argument_type value;
-
-    public:
-      binder2nd(const _Operation& __x,
-               const second_argument_type& __y)
-      : op(__x), value(__y) { }
-
-      result_type
-      operator()(const first_argument_type& __x) const
-      { return op(__x, value); }
-
-      // _GLIBCXX_RESOLVE_LIB_DEFECTS
-      // 109.  Missing binders for non-const sequence elements
-      result_type
-      operator()(first_argument_type& __x)
-      { return op(__x, value); }
-    };
-
-  /** @brief Similar to std::equal_to, but allows two different types. */
-  template<typename T1, typename T2>
+/** @brief Similar to std::binder1st,
+  *  but giving the argument types explicitly. */
+template<typename _Predicate, typename argument_type>
+  class unary_negate
+  : public std::unary_function<argument_type, bool>
+  {
+  protected:
+    _Predicate _M_pred;
+
+  public:
+    explicit
+    unary_negate(const _Predicate& __x) : _M_pred(__x) { }
+
+    bool
+    operator()(const argument_type& __x)
+    { return !_M_pred(__x); }
+  };
+
+/** @brief Similar to std::binder1st,
+  *  but giving the argument types explicitly. */
+template<
+    typename _Operation,
+    typename first_argument_type,
+    typename second_argument_type,
+    typename result_type>
+  class binder1st
+  : public std::unary_function<second_argument_type, result_type>
+  {
+  protected:
+    _Operation op;
+    first_argument_type value;
+
+  public:
+    binder1st(const _Operation& __x,
+              const first_argument_type& __y)
+    : op(__x), value(__y) { }
+
+    result_type
+    operator()(const second_argument_type& __x)
+    { return op(value, __x); }
+
+    // _GLIBCXX_RESOLVE_LIB_DEFECTS
+    // 109.  Missing binders for non-const sequence elements
+    result_type
+    operator()(second_argument_type& __x) const
+    { return op(value, __x); }
+  };
+
+/**
+  *  @brief Similar to std::binder2nd, but giving the argument types
+  *  explicitly.
+  */
+template<
+    typename _Operation,
+    typename first_argument_type,
+    typename second_argument_type,
+    typename result_type>
+  class binder2nd
+  : public std::unary_function<first_argument_type, result_type>
+  {
+  protected:
+    _Operation op;
+    second_argument_type value;
+
+  public:
+    binder2nd(const _Operation& __x,
+              const second_argument_type& __y)
+    : op(__x), value(__y) { }
+
+    result_type
+    operator()(const first_argument_type& __x) const
+    { return op(__x, value); }
+
+    // _GLIBCXX_RESOLVE_LIB_DEFECTS
+    // 109.  Missing binders for non-const sequence elements
+    result_type
+    operator()(first_argument_type& __x)
+    { return op(__x, value); }
+  };
+
+/** @brief Similar to std::equal_to, but allows two different types. */
+template<typename T1, typename T2>
   struct equal_to : std::binary_function<T1, T2, bool>
   {
     bool operator()(const T1& t1, const T2& t2) const
     { return t1 == t2; }
   };
 
-  /** @brief Similar to std::less, but allows two different types. */
-  template<typename T1, typename T2>
+/** @brief Similar to std::less, but allows two different types. */
+template<typename T1, typename T2>
   struct less : std::binary_function<T1, T2, bool>
   {
-    bool 
+    bool
     operator()(const T1& t1, const T2& t2) const
     { return t1 < t2; }
 
-    bool 
+    bool
     operator()(const T2& t2, const T1& t1) const
     { return t2 < t1; }
   };
 
-  // Partial specialization for one type. Same as std::less.
-  template<typename _Tp>
-  struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
-    {
-      bool
-      operator()(const _Tp& __x, const _Tp& __y) const
-      { return __x < __y; }
-    };
+// Partial specialization for one type. Same as std::less.
+template<typename _Tp>
+struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
+  {
+    bool
+    operator()(const _Tp& __x, const _Tp& __y) const
+    { return __x < __y; }
+  };
 
 
-    /** @brief Similar to std::plus, but allows two different types. */
-  template<typename _Tp1, typename _Tp2>
-    struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
-    {
-      typedef typeof(*static_cast<_Tp1*>(NULL) + *static_cast<_Tp2*>(NULL)) result;
+  /** @brief Similar to std::plus, but allows two different types. */
+template<typename _Tp1, typename _Tp2>
+  struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
+  {
+    typedef typeof(*static_cast<_Tp1*>(NULL)
+                    + *static_cast<_Tp2*>(NULL)) result;
 
-      result
-      operator()(const _Tp1& __x, const _Tp2& __y) const
-      { return __x + __y; }
-    };
+    result
+    operator()(const _Tp1& __x, const _Tp2& __y) const
+    { return __x + __y; }
+  };
 
-  // Partial specialization for one type. Same as std::plus.
-  template<typename _Tp>
-    struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
-    {
-      typedef typeof(*static_cast<_Tp*>(NULL) + *static_cast<_Tp*>(NULL)) result;
+// Partial specialization for one type. Same as std::plus.
+template<typename _Tp>
+  struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
+  {
+    typedef typeof(*static_cast<_Tp*>(NULL)
+                    + *static_cast<_Tp*>(NULL)) result;
 
-      result
-      operator()(const _Tp& __x, const _Tp& __y) const
-      { return __x + __y; }
-    };
+    result
+    operator()(const _Tp& __x, const _Tp& __y) const
+    { return __x + __y; }
+  };
 
 
-  /** @brief Similar to std::multiplies, but allows two different types. */
-  template<typename _Tp1, typename _Tp2>
-    struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
-    {
-      typedef typeof(*static_cast<_Tp1*>(NULL) * *static_cast<_Tp2*>(NULL)) result;
+/** @brief Similar to std::multiplies, but allows two different types. */
+template<typename _Tp1, typename _Tp2>
+  struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
+  {
+    typedef typeof(*static_cast<_Tp1*>(NULL)
+                    * *static_cast<_Tp2*>(NULL)) result;
 
-      result
-      operator()(const _Tp1& __x, const _Tp2& __y) const
-      { return __x * __y; }
-    };
+    result
+    operator()(const _Tp1& __x, const _Tp2& __y) const
+    { return __x * __y; }
+  };
 
-  // Partial specialization for one type. Same as std::multiplies.
-  template<typename _Tp>
-    struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
-    {
-      typedef typeof(*static_cast<_Tp*>(NULL) * *static_cast<_Tp*>(NULL)) result;
+// Partial specialization for one type. Same as std::multiplies.
+template<typename _Tp>
+  struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
+  {
+    typedef typeof(*static_cast<_Tp*>(NULL)
+                    * *static_cast<_Tp*>(NULL)) result;
 
-      result
-      operator()(const _Tp& __x, const _Tp& __y) const
-      { return __x * __y; }
-    };
+    result
+    operator()(const _Tp& __x, const _Tp& __y) const
+    { return __x * __y; }
+  };
 
 
-  template<typename T, typename _DifferenceTp>
+template<typename T, typename _DifferenceTp>
   class pseudo_sequence;
 
-  /** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
-   *  If features the usual random-access iterator functionality.
-   *  @param T Sequence value type.
-   *  @param difference_type Sequence difference type. 
-   */
-  template<typename T, typename _DifferenceTp>
+/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
+  *  If features the usual random-access iterator functionality.
+  *  @param T Sequence value type.
+  *  @param difference_type Sequence difference type.
+  */
+template<typename T, typename _DifferenceTp>
   class pseudo_sequence_iterator
   {
   public:
@@ -296,34 +326,34 @@ namespace __gnu_parallel
     operator++(int)
     { return type(pos++); }
 
-    const T& 
+    const T&
     operator*() const
     { return val; }
 
-    const T& 
+    const T&
     operator[](difference_type) const
     { return val; }
 
-    bool 
+    bool
     operator==(const type& i2)
     { return pos == i2.pos; }
 
-    difference_type 
+    difference_type
     operator!=(const type& i2)
     { return pos != i2.pos; }
 
-    difference_type 
+    difference_type
     operator-(const type& i2)
     { return pos - i2.pos; }
   };
 
-  /** @brief Sequence that conceptually consists of multiple copies of
-      the same element.
-   *  The copies are not stored explicitly, of course.
-   *  @param T Sequence value type.
-   *  @param difference_type Sequence difference type. 
-   */
-  template<typename T, typename _DifferenceTp>
+/** @brief Sequence that conceptually consists of multiple copies of
+    the same element.
+  *  The copies are not stored explicitly, of course.
+  *  @param T Sequence value type.
+  *  @param difference_type Sequence difference type.
+  */
+template<typename T, typename _DifferenceTp>
   class pseudo_sequence
   {
     typedef pseudo_sequence<T, _DifferenceTp> type;
@@ -335,10 +365,10 @@ namespace __gnu_parallel
     typedef pseudo_sequence_iterator<T, uint64> iterator;
 
     /** @brief Constructor.
-     *  @param val Element of the sequence.
-     *  @param count Number of (virtual) copies.
-     */
-    pseudo_sequence(const T& val, difference_type count) 
+      *  @param val Element of the sequence.
+      *  @param count Number of (virtual) copies.
+      */
+    pseudo_sequence(const T& val, difference_type count)
     : val(val), count(count)  { }
 
     /** @brief Begin iterator. */
@@ -356,67 +386,66 @@ namespace __gnu_parallel
     difference_type count;
   };
 
-  /** @brief Functor that does nothing */
-  template<typename _ValueTp>
+/** @brief Functor that does nothing */
+template<typename _ValueTp>
   class void_functor
   {
-    inline void 
+    inline void
     operator()(const _ValueTp& v) const { }
   };
 
-  /** @brief Compute the median of three referenced elements,
-      according to @c comp.
-   *  @param a First iterator.
-   *  @param b Second iterator.
-   *  @param c Third iterator.
-   *  @param comp Comparator. 
-   */
-  template<typename RandomAccessIterator, typename Comparator>
-  RandomAccessIterator
-  median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b, 
-                           RandomAccessIterator c, Comparator& comp)
+/** @brief Compute the median of three referenced elements,
+    according to @c comp.
+  *  @param a First iterator.
+  *  @param b Second iterator.
+  *  @param c Third iterator.
+  *  @param comp Comparator.
+  */
+template<typename RandomAccessIterator, typename Comparator>
+RandomAccessIterator
+  median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
+                            RandomAccessIterator c, Comparator& comp)
   {
     if (comp(*a, *b))
       if (comp(*b, *c))
-       return b;
+        return b;
       else
-       if (comp(*a, *c))
-         return c;
-       else
-         return a;
+        if (comp(*a, *c))
+          return c;
+        else
+          return a;
     else
       {
-       // Just swap a and b.
-       if (comp(*a, *c))
-         return a;
-       else
-         if (comp(*b, *c))
-           return c;
-         else
-           return b;
+        // Just swap a and b.
+        if (comp(*a, *c))
+          return a;
+        else
+          if (comp(*b, *c))
+            return c;
+          else
+            return b;
       }
   }
 
-  // Avoid the use of assert, because we're trying to keep the <cassert>
-  // include out of the mix. (Same as debug mode).
-  inline void
-  __replacement_assert(const char* __file, int __line, 
-                      const char* __function, const char* __condition)
-  {
-    std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
-               __function, __condition);
-    __builtin_abort();
-  }
-  
+// Avoid the use of assert, because we're trying to keep the <cassert>
+// include out of the mix. (Same as debug mode).
+inline void
+__replacement_assert(const char* __file, int __line,
+                     const char* __function, const char* __condition)
+{
+  std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
+              __function, __condition);
+  __builtin_abort();
+}
+
 #define _GLIBCXX_PARALLEL_ASSERT(_Condition)                            \
-  do                                                                   \
-    {                                                                  \
-      if (!(_Condition))                                               \
-       __gnu_parallel::__replacement_assert(__FILE__, __LINE__,        \
-                                   __PRETTY_FUNCTION__, #_Condition);  \
-    } while (false)
-  
+do                                                                     \
+  {                                                                    \
+    if (!(_Condition))                                         \
+      __gnu_parallel::__replacement_assert(__FILE__, __LINE__, \
+                                  __PRETTY_FUNCTION__, #_Condition);   \
+  } while (false)
+
 } //namespace __gnu_parallel
 
 #endif
-
index 717bda5..edaea38 100644 (file)
@@ -39,7 +39,7 @@
 #include <cstdio>
 
 /** @brief Determine verbosity level of the parallel mode.
- *  Level 1 prints a message each time when entering a parallel-mode function. */
+ *  Level 1 prints a message each time a parallel-mode function is entered. */
 #define _GLIBCXX_VERBOSE_LEVEL 0
 
 /** @def _GLIBCXX_CALL
 #define _GLIBCXX_CALL(n)
 #endif
 #if (_GLIBCXX_VERBOSE_LEVEL == 1)
-#define _GLIBCXX_CALL(n) printf("   %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
+#define _GLIBCXX_CALL(n) \
+  printf("   %s:\niam = %d, n = %ld, num_threads = %d\n", \
+  __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
 #endif
 
+#ifndef _GLIBCXX_SCALE_DOWN_FPU
 /** @brief Use floating-point scaling instead of modulo for mapping
  *  random numbers to a range.  This can be faster on certain CPUs. */
 #define _GLIBCXX_SCALE_DOWN_FPU 0
+#endif
 
+#ifndef _GLIBCXX_ASSERTIONS
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
  *  Should be switched on only locally. */
 #define _GLIBCXX_ASSERTIONS 0
+#endif
 
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- *  Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
+ *  Consider the size of the L1 cache for
+ *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
+#endif
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- *  Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
+ *  Consider the size of the TLB for
+ *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
+#endif
 
+#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
 /** @brief First copy the data, sort it locally, and merge it back
  * (0); or copy it back after everything is done (1).
  *
  *  Recommendation: 0 */
 #define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
-
+#endif
index 730875d..4c4167e 100644 (file)
 
 namespace __gnu_parallel
 {
-  /** @brief Function to split a sequence into parts of almost equal size.
  *
  *  The resulting sequence s of length p+1 contains the splitting
  *  positions when splitting the range [0,n) into parts of almost
  *  equal size (plus minus 1).  The first entry is 0, the last one
  *  n. There may result empty parts.
  *  @param n Number of elements
  *  @param p Number of parts
  *  @param s Splitters
  *  @returns End of splitter sequence, i. e. @c s+p+1 */
-  template<typename _DifferenceTp, typename OutputIterator>
+/** @brief Function to split a sequence into parts of almost equal size.
+ *
*  The resulting sequence s of length num_threads+1 contains the splitting
+ *  positions when splitting the range [0,n) into parts of almost
+ *  equal size (plus minus 1).  The first entry is 0, the last one
+ *  n. There may result empty parts.
+ *  @param n Number of elements
*  @param num_threads Number of parts
+ *  @param s Splitters
*  @returns End of splitter sequence, i. e. @c s+num_threads+1 */
+template<typename difference_type, typename OutputIterator>
   OutputIterator
-  equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
+  equally_split(difference_type n,
+                thread_index_t num_threads,
+                OutputIterator s)
   {
-    typedef _DifferenceTp difference_type;
-    difference_type chunk_length = n / p, split = n % p, start = 0;
-    for (int i = 0; i < p; i++)
+    difference_type chunk_length = n / num_threads,
+                    num_longer_chunks = n % num_threads,
+                    pos = 0;
+    for (thread_index_t i = 0; i < num_threads; ++i)
       {
-       *s++ = start;
-       start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
+        *s++ = pos;
+        pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
       }
     *s++ = n;
     return s;
   }
+
+
+/** @brief Function to split a sequence into parts of almost equal size.
+ *
+ *  Returns the position of the splitting point between
+ *  thread number thread_no (included) and
+ *  thread number thread_no+1 (excluded).
+ *  @param n Number of elements
+ *  @param num_threads Number of parts
+ *  @returns Splitting point */
+template<typename difference_type>
+  difference_type
+  equally_split_point(difference_type n,
+                      thread_index_t num_threads,
+                      thread_index_t thread_no)
+  {
+    difference_type chunk_length = n / num_threads,
+                    num_longer_chunks = n % num_threads;
+
+    if(thread_no < num_longer_chunks)
+      return thread_no * (chunk_length + 1);
+    else
+      return num_longer_chunks * (chunk_length + 1)
+          + (thread_no - num_longer_chunks) * chunk_length;
+  }
 }
 
 #endif
index c67ffb5..a78c16b 100644 (file)
@@ -66,7 +66,7 @@
  *  @brief Include guarded (sequences may run empty) loser tree,
  *  moving objects.
  *  @see __gnu_parallel::Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE 0
+#define _GLIBCXX_LOSER_TREE 1
 #endif
 
 #ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
index bc89fa5..2a5b22c 100644 (file)
@@ -10,7 +10,7 @@
 
 // This library is distributed in the hope that it will be useful, but
 // WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURstartE.  See the GNU
 // General Public License for more details.
 
 // You should have received a copy of the GNU General Public License
 
 namespace __gnu_parallel
 {
-  /** 
-   *  @brief Parallel std::find, switch for different algorithms.
-   *  @param begin1 Begin iterator of first sequence.
-   *  @param end1 End iterator of first sequence.
-   *  @param begin2 Begin iterator of second sequence. Must have same
-   *  length as first sequence.
-   *  @param pred Find predicate.
-   *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
-   *  @return Place of finding in both sequences. 
-   */
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
+/**
+ *  @brief Parallel std::find, switch for different algorithms.
+ *  @param begin1 Begin iterator of first sequence.
+ *  @param end1 End iterator of first sequence.
+ *  @param begin2 Begin iterator of second sequence. Must have same
+ *  length as first sequence.
+ *  @param pred Find predicate.
+ *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
+ *  @return Place of finding in both sequences.
+ */
+template<
+    typename RandomAccessIterator1,
+    typename RandomAccessIterator2,
+    typename Pred,
+    typename Selector>
   std::pair<RandomAccessIterator1, RandomAccessIterator2>
   find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
-               RandomAccessIterator2 begin2, Pred pred, Selector selector)
+                RandomAccessIterator2 begin2, Pred pred, Selector selector)
   {
     switch (Settings::find_distribution)
       {
       case Settings::GROWING_BLOCKS:
-       return find_template(begin1, end1, begin2, pred, selector, growing_blocks_tag());
+        return find_template(begin1, end1, begin2, pred, selector,
+                            growing_blocks_tag());
       case Settings::CONSTANT_SIZE_BLOCKS:
-       return find_template(begin1, end1, begin2, pred, selector, constant_size_blocks_tag());
+        return find_template(begin1, end1, begin2, pred, selector,
+                            constant_size_blocks_tag());
       case Settings::EQUAL_SPLIT:
-       return find_template(begin1, end1, begin2, pred, selector, equal_split_tag());
+        return find_template(begin1, end1, begin2, pred, selector,
+                            equal_split_tag());
       default:
-       _GLIBCXX_PARALLEL_ASSERT(false);
-       return std::make_pair(begin1, begin2);
+        _GLIBCXX_PARALLEL_ASSERT(false);
+        return std::make_pair(begin1, begin2);
       }
   }
 
 #if _GLIBCXX_FIND_EQUAL_SPLIT
 
-  /** 
-   *  @brief Parallel std::find, equal splitting variant.
-   *  @param begin1 Begin iterator of first sequence.
-   *  @param end1 End iterator of first sequence.
-   *  @param begin2 Begin iterator of second sequence. Second sequence
-   *  must have same length as first sequence.
-   *  @param pred Find predicate.
-   *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
-   *  @return Place of finding in both sequences. 
-   */
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
+/**
+ *  @brief Parallel std::find, equal splitting variant.
+ *  @param begin1 Begin iterator of first sequence.
+ *  @param end1 End iterator of first sequence.
+ *  @param begin2 Begin iterator of second sequence. Second sequence
+ *  must have same length as first sequence.
+ *  @param pred Find predicate.
+ *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
+ *  @return Place of finding in both sequences.
+ */
+template<
+    typename RandomAccessIterator1,
+    typename RandomAccessIterator2,
+    typename Pred,
+    typename Selector>
   std::pair<RandomAccessIterator1, RandomAccessIterator2>
-  find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, equal_split_tag)
+  find_template(RandomAccessIterator1 begin1,
+                RandomAccessIterator1 end1,
+                RandomAccessIterator2 begin2,
+                Pred pred,
+                Selector selector,
+                equal_split_tag)
   {
     _GLIBCXX_CALL(end1 - begin1)
 
@@ -100,79 +116,89 @@ namespace __gnu_parallel
     typedef typename traits_type::value_type value_type;
 
     difference_type length = end1 - begin1;
-
     difference_type result = length;
+    difference_type* borders;
 
-    const thread_index_t num_threads = get_max_threads();
     omp_lock_t result_lock;
     omp_init_lock(&result_lock);
 
-    difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
-
-    equally_split(length, num_threads, borders);
-
-#pragma omp parallel shared(result) num_threads(num_threads)
-    {
-      int iam = omp_get_thread_num();
-      difference_type pos = borders[iam], limit = borders[iam + 1];
-
-      RandomAccessIterator1 i1 = begin1 + pos;
-      RandomAccessIterator2 i2 = begin2 + pos;
-      for (; pos < limit; pos++)
-       {
-#pragma omp flush(result)
-          // Result has been set to something lower.
-          if (result < pos)
-            break;
-
-          if (selector(i1, i2, pred))
-            {
-              omp_set_lock(&result_lock);
-              if (result > pos)
-                result = pos;
-              omp_unset_lock(&result_lock);
+    thread_index_t num_threads = get_max_threads();
+#   pragma omp parallel num_threads(num_threads)
+      {
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+            borders = new difference_type[num_threads + 1];
+            equally_split(length, num_threads, borders);
+          } //single
+
+        thread_index_t iam = omp_get_thread_num();
+        difference_type start = borders[iam], stop = borders[iam + 1];
+
+        RandomAccessIterator1 i1 = begin1 + start;
+        RandomAccessIterator2 i2 = begin2 + start;
+        for (difference_type pos = start; pos < stop; ++pos)
+          {
+            #pragma omp flush(result)
+            // Result has been set to something lower.
+            if (result < pos)
               break;
-            }
-          i1++;
-          i2++;
-        }
-    }
+
+            if (selector(i1, i2, pred))
+              {
+                omp_set_lock(&result_lock);
+                if (pos < result)
+                  result = pos;
+                omp_unset_lock(&result_lock);
+                break;
+              }
+            ++i1;
+            ++i2;
+          }
+      } //parallel
 
     omp_destroy_lock(&result_lock);
-    return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
+    delete[] borders;
+
+    return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
+        begin1 + result, begin2 + result);
   }
 
 #endif
 
 #if _GLIBCXX_FIND_GROWING_BLOCKS
 
-  /** 
-   *  @brief Parallel std::find, growing block size variant.
-   *  @param begin1 Begin iterator of first sequence.
-   *  @param end1 End iterator of first sequence.
-   *  @param begin2 Begin iterator of second sequence. Second sequence
-   *  must have same length as first sequence.
-   *  @param pred Find predicate.
-   *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
-   *  @return Place of finding in both sequences.
-   *  @see __gnu_parallel::Settings::find_sequential_search_size
-   *  @see __gnu_parallel::Settings::find_initial_block_size
-   *  @see __gnu_parallel::Settings::find_maximum_block_size
-   *  @see __gnu_parallel::Settings::find_increasing_factor
-   *
-   *  There are two main differences between the growing blocks and
-   *  the constant-size blocks variants.
-   *  1. For GB, the block size grows; for CSB, the block size is fixed.
-
-   *  2. For GB, the blocks are allocated dynamically;
-   *     for CSB, the blocks are allocated in a predetermined manner,
-   *     namely spacial round-robin.
-   */
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
+/**
+ *  @brief Parallel std::find, growing block size variant.
+ *  @param begin1 Begin iterator of first sequence.
+ *  @param end1 End iterator of first sequence.
+ *  @param begin2 Begin iterator of second sequence. Second sequence
+ *  must have same length as first sequence.
+ *  @param pred Find predicate.
+ *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
+ *  @return Place of finding in both sequences.
+ *  @see __gnu_parallel::Settings::find_sequential_search_size
+ *  @see __gnu_parallel::Settings::find_initial_block_size
+ *  @see __gnu_parallel::Settings::find_maximum_block_size
+ *  @see __gnu_parallel::Settings::find_increasing_factor
+ *
+ *  There are two main differences between the growing blocks and
+ *  the constant-size blocks variants.
+ *  1. For GB, the block size grows; for CSB, the block size is fixed.
+
+ *  2. For GB, the blocks are allocated dynamically;
+ *     for CSB, the blocks are allocated in a predetermined manner,
+ *     namely spacial round-robin.
+ */
+template<
+    typename RandomAccessIterator1,
+    typename RandomAccessIterator2,
+    typename Pred,
+    typename Selector>
   std::pair<RandomAccessIterator1, RandomAccessIterator2>
   find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
-               RandomAccessIterator2 begin2, Pred pred, Selector selector,
-               growing_blocks_tag)
+                RandomAccessIterator2 begin2, Pred pred, Selector selector,
+                growing_blocks_tag)
   {
     _GLIBCXX_CALL(end1 - begin1)
 
@@ -182,101 +208,118 @@ namespace __gnu_parallel
 
     difference_type length = end1 - begin1;
 
-    difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size);
+    difference_type sequential_search_size = std::min<difference_type>(
+        length, Settings::find_sequential_search_size);
 
     // Try it sequentially first.
     std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
-      selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred);
+      selector.sequential_algorithm(
+          begin1, begin1 + sequential_search_size, begin2, pred);
 
     if (find_seq_result.first != (begin1 + sequential_search_size))
       return find_seq_result;
 
     // Index of beginning of next free block (after sequential find).
-    difference_type next_block_pos = sequential_search_size;
+    difference_type next_block_start = sequential_search_size;
     difference_type result = length;
-    const thread_index_t num_threads = get_max_threads();
 
     omp_lock_t result_lock;
     omp_init_lock(&result_lock);
 
-#pragma omp parallel shared(result) num_threads(num_threads)
-    {
-      // Not within first k elements -> start parallel.
-      thread_index_t iam = omp_get_thread_num();
-
-      difference_type block_size = Settings::find_initial_block_size;
-      difference_type start = fetch_and_add<difference_type>(&next_block_pos, block_size);
-
-      // Get new block, update pointer to next block.
-      difference_type stop = std::min<difference_type>(length, start + block_size);
-
-      std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
-
-      while (start < length)
-       {
-#pragma omp flush(result)
-         // Get new value of result.
-         if (result < start)
-           {
-             // No chance to find first element.
-             break;
-           }
-
-         local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred);
-         if (local_result.first != (begin1 + stop))
-           {
-              omp_set_lock(&result_lock);
-             if ((local_result.first - begin1) < result)
-               {
-                 result = local_result.first - begin1;
-
-                 // Result cannot be in future blocks, stop algorithm.
-                 fetch_and_add<difference_type>(&next_block_pos, length);
-               }
-              omp_unset_lock(&result_lock);
-           }
-
-         block_size = std::min<difference_type>(block_size * Settings::find_increasing_factor, Settings::find_maximum_block_size);
-
-         // Get new block, update pointer to next block.
-         start = fetch_and_add<difference_type>(&next_block_pos, block_size);
-         stop = (length < (start + block_size)) ? length : (start + block_size);
-       }
-    }
+    thread_index_t num_threads = get_max_threads();
+#   pragma omp parallel shared(result) num_threads(num_threads)
+      {
+#       pragma omp single
+          num_threads = omp_get_num_threads();
+
+        // Not within first k elements -> start parallel.
+        thread_index_t iam = omp_get_thread_num();
+
+        difference_type block_size = Settings::find_initial_block_size;
+        difference_type start =
+            fetch_and_add<difference_type>(&next_block_start, block_size);
+
+        // Get new block, update pointer to next block.
+        difference_type stop =
+            std::min<difference_type>(length, start + block_size);
+
+        std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
+
+        while (start < length)
+          {
+#           pragma omp flush(result)
+            // Get new value of result.
+            if (result < start)
+              {
+                // No chance to find first element.
+                break;
+              }
+
+            local_result = selector.sequential_algorithm(
+                begin1 + start, begin1 + stop, begin2 + start, pred);
+            if (local_result.first != (begin1 + stop))
+              {
+                omp_set_lock(&result_lock);
+                if ((local_result.first - begin1) < result)
+                  {
+                    result = local_result.first - begin1;
+
+                    // Result cannot be in future blocks, stop algorithm.
+                    fetch_and_add<difference_type>(&next_block_start, length);
+                  }
+                  omp_unset_lock(&result_lock);
+              }
+
+            block_size = std::min<difference_type>(
+                block_size * Settings::find_increasing_factor,
+                Settings::find_maximum_block_size);
+
+            // Get new block, update pointer to next block.
+            start =
+                fetch_and_add<difference_type>(&next_block_start, block_size);
+            stop = (length < (start + block_size)) ?
+                        length : (start + block_size);
+          }
+      } //parallel
 
     omp_destroy_lock(&result_lock);
 
     // Return iterator on found element.
-    return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
+    return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
+        begin1 + result, begin2 + result);
   }
 
 #endif
 
 #if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS
 
-  /** 
-   *   @brief Parallel std::find, constant block size variant.
-   *  @param begin1 Begin iterator of first sequence.
-   *  @param end1 End iterator of first sequence.
-   *  @param begin2 Begin iterator of second sequence. Second sequence
-   *  must have same length as first sequence.
-   *  @param pred Find predicate.
-   *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
-   *  @return Place of finding in both sequences.
-   *  @see __gnu_parallel::Settings::find_sequential_search_size
-   *  @see __gnu_parallel::Settings::find_block_size
-   *  There are two main differences between the growing blocks and the
-   *  constant-size blocks variants.
-   *  1. For GB, the block size grows; for CSB, the block size is fixed.
-   *  2. For GB, the blocks are allocated dynamically; for CSB, the
-   *  blocks are allocated in a predetermined manner, namely spacial
-   *  round-robin.
-   */
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
+/**
+ *   @brief Parallel std::find, constant block size variant.
+ *  @param begin1 Begin iterator of first sequence.
+ *  @param end1 End iterator of first sequence.
+ *  @param begin2 Begin iterator of second sequence. Second sequence
+ *  must have same length as first sequence.
+ *  @param pred Find predicate.
+ *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
+ *  @return Place of finding in both sequences.
+ *  @see __gnu_parallel::Settings::find_sequential_search_size
+ *  @see __gnu_parallel::Settings::find_block_size
+ *  There are two main differences between the growing blocks and the
+ *  constant-size blocks variants.
+ *  1. For GB, the block size grows; for CSB, the block size is fixed.
+ *  2. For GB, the blocks are allocated dynamically; for CSB, the
+ *  blocks are allocated in a predetermined manner, namely spacial
+ *  round-robin.
+ */
+template<
+    typename RandomAccessIterator1,
+    typename RandomAccessIterator2,
+    typename Pred,
+    typename Selector>
   std::pair<RandomAccessIterator1, RandomAccessIterator2>
   find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
-               RandomAccessIterator2 begin2, Pred pred, Selector selector,
-               constant_size_blocks_tag)
+                RandomAccessIterator2 begin2, Pred pred, Selector selector,
+                constant_size_blocks_tag)
   {
     _GLIBCXX_CALL(end1 - begin1)
     typedef std::iterator_traits<RandomAccessIterator1> traits_type;
@@ -285,72 +328,77 @@ namespace __gnu_parallel
 
     difference_type length = end1 - begin1;
 
-    difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size);
+    difference_type sequential_search_size = std::min<difference_type>(
+        length, Settings::find_sequential_search_size);
 
     // Try it sequentially first.
     std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
-      selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred);
+      selector.sequential_algorithm(begin1, begin1 + sequential_search_size,
+                                    begin2, pred);
 
     if (find_seq_result.first != (begin1 + sequential_search_size))
       return find_seq_result;
 
     difference_type result = length;
-    const thread_index_t num_threads = get_max_threads();
-
     omp_lock_t result_lock;
     omp_init_lock(&result_lock);
 
     // Not within first sequential_search_size elements -> start parallel.
-#pragma omp parallel shared(result) num_threads(num_threads)
-    {
-      thread_index_t iam = omp_get_thread_num();
-      difference_type block_size = Settings::find_initial_block_size;
-
-      difference_type start, stop;
-
-      // First element of thread's current iteration.
-      difference_type iteration_start = sequential_search_size;
-
-      // Where to work (initialization).
-      start = iteration_start + iam * block_size;
-      stop = std::min<difference_type>(length, start + block_size);
-
-      std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
-
-      while (start < length)
-       {
-         // Get new value of result.
-#pragma omp flush(result)
-         // No chance to find first element.
-         if (result < start)
-           break;
-
-         local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred);
-         if (local_result.first != (begin1 + stop))
-           {
-             omp_set_lock(&result_lock);
-             if ((local_result.first - begin1) < result)
-               result = local_result.first - begin1;
-              omp_unset_lock(&result_lock);
-             // Will not find better value in its interval.
-             break;
-           }
-
-         iteration_start += num_threads * block_size;
-
-         // Where to work.
-         start = iteration_start + iam * block_size;
-         stop = std::min<difference_type>(length, start + block_size);
-       }
-    }
+
+    thread_index_t num_threads = get_max_threads();
+#   pragma omp parallel shared(result) num_threads(num_threads)
+      {
+#       pragma omp single
+          num_threads = omp_get_num_threads();
+
+        thread_index_t iam = omp_get_thread_num();
+        difference_type block_size = Settings::find_initial_block_size;
+
+        // First element of thread's current iteration.
+        difference_type iteration_start = sequential_search_size;
+
+        // Where to work (initialization).
+        difference_type start = iteration_start + iam * block_size;
+        difference_type stop =
+            std::min<difference_type>(length, start + block_size);
+
+        std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
+
+        while (start < length)
+          {
+            // Get new value of result.
+#           pragma omp flush(result)
+            // No chance to find first element.
+            if (result < start)
+              break;
+            local_result = selector.sequential_algorithm(
+                begin1 + start, begin1 + stop,
+                begin2 + start, pred);
+            if (local_result.first != (begin1 + stop))
+              {
+                omp_set_lock(&result_lock);
+                if ((local_result.first - begin1) < result)
+                  result = local_result.first - begin1;
+                omp_unset_lock(&result_lock);
+                // Will not find better value in its interval.
+                break;
+              }
+
+            iteration_start += num_threads * block_size;
+
+            // Where to work.
+            start = iteration_start + iam * block_size;
+            stop = std::min<difference_type>(length, start + block_size);
+          }
+      } //parallel
 
     omp_destroy_lock(&result_lock);
 
     // Return iterator on found element.
-    return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
+    return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
+        begin1 + result, begin2 + result);
   }
 #endif
 } // end namespace
 
 #endif
-
index 5c7a808..7b8b654 100644 (file)
@@ -29,9 +29,9 @@
 // Public License.
 
 /** @file parallel/losertree.h
- *  @brief Many generic loser tree variants.
- *  This file is a GNU parallel extension to the Standard C++ Library.
- */
+*  @brief Many generic loser tree variants.
+*  This file is a GNU parallel extension to the Standard C++ Library.
+*/
 
 // Written by Johannes Singler.
 
@@ -49,13 +49,13 @@ namespace __gnu_parallel
 
 #if _GLIBCXX_LOSER_TREE_EXPLICIT
 
-  /** @brief Guarded loser tree, copying the whole element into the
-   * tree structure.
-   *
-   *  Guarding is done explicitly through two flags per element, inf
-   *  and sup This is a quite slow variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, copying the whole element into the
+* tree structure.
+*
+*  Guarding is done explicitly through two flags per element, inf
+*  and sup This is a quite slow variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreeExplicit
   {
   private:
@@ -76,26 +76,25 @@ namespace __gnu_parallel
     Comparator comp;
 
   public:
-    inline LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       size = _size;
       offset = size;
       losers = new Loser[size];
       for (unsigned int l = 0; l < size; l++)
-       {
-         //losers[l].key = ...         stays unset
-         losers[l].inf = true;
-         losers[l].sup = false;
-         //losers[l].source = -1;      //sentinel
-       }
+        {
+          //losers[l].key = ...        stays unset
+          losers[l].inf = true;
+          losers[l].sup = false;
+          //losers[l].source = -1;     //sentinel
+        }
     }
 
     inline ~LoserTreeExplicit()
     { delete[] losers; }
 
-    inline void
-    print() { }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -105,16 +104,17 @@ namespace __gnu_parallel
     {
       bool inf = false;
       for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
-       {
-         if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && comp(losers[pos].key, key)) || losers[pos].inf || sup)
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].key, key);
-             std::swap(losers[pos].inf, inf);
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
-           }
-       }
+        {
+          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
+               && comp(losers[pos].key, key)) || losers[pos].inf || sup)
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].key, key);
+              std::swap(losers[pos].inf, inf);
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
+            }
+        }
 
       losers[0].key = key;
       losers[0].inf = inf;
@@ -131,19 +131,19 @@ namespace __gnu_parallel
       bool inf = false;
       int source = losers[0].source;
       for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted.
-         if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup 
-              && comp(losers[pos].key, key))
-             || losers[pos].inf || sup)
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].key, key);
-             std::swap(losers[pos].inf, inf);
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
-           }
-       }
+        {
+          // The smaller one gets promoted.
+          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
+              && comp(losers[pos].key, key))
+              || losers[pos].inf || sup)
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].key, key);
+              std::swap(losers[pos].inf, inf);
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
+            }
+        }
 
       losers[0].key = key;
       losers[0].inf = inf;
@@ -156,19 +156,19 @@ namespace __gnu_parallel
     {
       bool inf = false;
       for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
-       {
-         if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup &&
-              ((comp(losers[pos].key, key)) ||
-               (!comp(key, losers[pos].key) && losers[pos].source < source)))
-             || losers[pos].inf || sup)
-           {
-             // Take next key.
-             std::swap(losers[pos].key, key);
-             std::swap(losers[pos].inf, inf);
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
-           }
-       }
+        {
+          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup &&
+              ((comp(losers[pos].key, key)) ||
+                (!comp(key, losers[pos].key) && losers[pos].source < source)))
+              || losers[pos].inf || sup)
+            {
+              // Take next key.
+              std::swap(losers[pos].key, key);
+              std::swap(losers[pos].inf, inf);
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
+            }
+        }
 
       losers[0].key = key;
       losers[0].inf = inf;
@@ -185,18 +185,18 @@ namespace __gnu_parallel
       bool inf = false;
       int source = losers[0].source;
       for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
-       {
-         if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
-              && ((comp(losers[pos].key, key)) ||
-               (!comp(key, losers[pos].key) && losers[pos].source < source)))
-             || losers[pos].inf || sup)
-           {
-             std::swap(losers[pos].key, key);
-             std::swap(losers[pos].inf, inf);
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
-           }
-       }
+        {
+          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
+              && ((comp(losers[pos].key, key)) ||
+                (!comp(key, losers[pos].key) && losers[pos].source < source)))
+              || losers[pos].inf || sup)
+            {
+              std::swap(losers[pos].key, key);
+              std::swap(losers[pos].inf, inf);
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
+            }
+        }
 
       losers[0].key = key;
       losers[0].inf = inf;
@@ -209,14 +209,14 @@ namespace __gnu_parallel
 
 #if _GLIBCXX_LOSER_TREE
 
-  /** @brief Guarded loser tree, either copying the whole element into
-   * the tree structure, or looking up the element via the index.
-   *
-   *  Guarding is done explicitly through one flag sup per element,
-   *  inf is not needed due to a better initialization routine.  This
-   *  is a well-performing variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, either copying the whole element into
+* the tree structure, or looking up the element via the index.
+*
+*  Guarding is done explicitly through one flag sup per element,
+*  inf is not needed due to a better initialization routine.  This
+*  is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTree
   {
   private:
@@ -240,22 +240,14 @@ namespace __gnu_parallel
       // Next greater power of 2.
       k = 1 << (log2(ik - 1) + 1);
       offset = k;
-      losers = new Loser[k * 2];
+      losers = static_cast<Loser*>(::operator new(k * 2 * sizeof(Loser)));
       for (unsigned int i = ik - 1; i < k; i++)
-       losers[i + k].sup = true;
+        losers[i + k].sup = true;
     }
 
     inline ~LoserTree()
     { delete[] losers; }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < (k * 2); i++)
-       printf("%d    %d from %d,  %d\n", i, losers[i].key, 
-              losers[i].source, losers[i].sup);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -267,33 +259,34 @@ namespace __gnu_parallel
 
       losers[pos].sup = sup;
       losers[pos].source = source;
-      losers[pos].key = key;
+      new(&(losers[pos].key)) T(key);
     }
 
     unsigned int
     init_winner (unsigned int root)
     {
       if (root >= k)
-       {
-         return root;
-       }
+        {
+          return root;
+        }
       else
-       {
-         unsigned int left = init_winner (2 * root);
-         unsigned int right = init_winner (2 * root + 1);
-         if (losers[right].sup ||
-             (!losers[left].sup && !comp(losers[right].key, losers[left].key)))
-           {
-             // Left one is less or equal.
-             losers[root] = losers[right];
-             return left;
-           }
-         else
-           {   // Right one is less.
-             losers[root] = losers[left];
-             return right;
-           }
-       }
+        {
+          unsigned int left = init_winner (2 * root);
+          unsigned int right = init_winner (2 * root + 1);
+          if (losers[right].sup ||
+              (!losers[left].sup
+                && !comp(losers[right].key, losers[left].key)))
+            {
+              // Left one is less or equal.
+              losers[root] = losers[right];
+              return left;
+            }
+          else
+            {  // Right one is less.
+              losers[root] = losers[left];
+              return right;
+            }
+        }
     }
 
     inline void
@@ -306,16 +299,16 @@ namespace __gnu_parallel
     {
       int source = losers[0].source;
       for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted.
-         if (sup || (!losers[pos].sup && comp(losers[pos].key, key)))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
-             std::swap(losers[pos].key, key);
-           }
-       }
+        {
+          // The smaller one gets promoted.
+          if (sup || (!losers[pos].sup && comp(losers[pos].key, key)))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
+              std::swap(losers[pos].key, key);
+            }
+        }
 
       losers[0].sup = sup;
       losers[0].source = source;
@@ -330,27 +323,28 @@ namespace __gnu_parallel
     init_winner_stable (unsigned int root)
     {
       if (root >= k)
-       {
-         return root;
-       }
+        {
+          return root;
+        }
       else
-       {
-         unsigned int left = init_winner (2 * root);
-         unsigned int right = init_winner (2 * root + 1);
-         if (  losers[right].sup ||
-               (!losers[left].sup && !comp(losers[right].key, losers[left].key)))
-           {
-             // Left one is less or equal.
-             losers[root] = losers[right];
-             return left;
-           }
-         else
-           {
-             // Right one is less.
-             losers[root] = losers[left];
-             return right;
-           }
-       }
+        {
+          unsigned int left = init_winner (2 * root);
+          unsigned int right = init_winner (2 * root + 1);
+          if (losers[right].sup
+              || (!losers[left].sup
+                && !comp(losers[right].key, losers[left].key)))
+            {
+              // Left one is less or equal.
+              losers[root] = losers[right];
+              return left;
+            }
+          else
+            {
+              // Right one is less.
+              losers[root] = losers[left];
+              return right;
+            }
+        }
     }
 
     inline void
@@ -363,19 +357,20 @@ namespace __gnu_parallel
     {
       int source = losers[0].source;
       for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted, ties are broken by source.
-         if (  (sup && (!losers[pos].sup || losers[pos].source < source)) ||
-               (!sup && !losers[pos].sup &&
-                ((comp(losers[pos].key, key)) ||
-                 (!comp(key, losers[pos].key) && losers[pos].source < source))))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
-             std::swap(losers[pos].key, key);
-           }
-       }
+        {
+          // The smaller one gets promoted, ties are broken by source.
+          if ( (sup && (!losers[pos].sup || losers[pos].source < source))
+                || (!sup && !losers[pos].sup
+                  && ((comp(losers[pos].key, key))
+                    || (!comp(key, losers[pos].key)
+                      && losers[pos].source < source))))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
+              std::swap(losers[pos].key, key);
+            }
+        }
 
       losers[0].sup = sup;
       losers[0].source = source;
@@ -387,14 +382,14 @@ namespace __gnu_parallel
 
 #if _GLIBCXX_LOSER_TREE_REFERENCE
 
-  /** @brief Guarded loser tree, either copying the whole element into
-   * the tree structure, or looking up the element via the index.
-   *
-   *  Guarding is done explicitly through one flag sup per element,
-   *  inf is not needed due to a better initialization routine.  This
-   *  is a well-performing variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, either copying the whole element into
+* the tree structure, or looking up the element via the index.
+*
+*  Guarding is done explicitly through one flag sup per element,
+*  inf is not needed due to a better initialization routine.  This
+*  is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreeReference
   {
 #undef COPY
@@ -423,7 +418,9 @@ namespace __gnu_parallel
     Comparator comp;
 
   public:
-    inline LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       ik = _k;
 
@@ -435,7 +432,7 @@ namespace __gnu_parallel
       keys = new T[ik];
 #endif
       for (unsigned int i = ik - 1; i < k; i++)
-       losers[i + k].sup = true;
+        losers[i + k].sup = true;
     }
 
     inline ~LoserTreeReference()
@@ -446,13 +443,6 @@ namespace __gnu_parallel
 #endif
     }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < (k * 2); i++)
-       printf("%d    %d from %d,  %d\n", i, KEY(i), losers[i].source, losers[i].sup);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -471,27 +461,27 @@ namespace __gnu_parallel
     init_winner(unsigned int root)
     {
       if (root >= k)
-       {
-         return root;
-       }
+        {
+          return root;
+        }
       else
-       {
-         unsigned int left = init_winner (2 * root);
-         unsigned int right = init_winner (2 * root + 1);
-         if (  losers[right].sup ||
-               (!losers[left].sup && !comp(KEY(right), KEY(left))))
-           {
-             // Left one is less or equal.
-             losers[root] = losers[right];
-             return left;
-           }
-         else
-           {
-             // Right one is less.
-             losers[root] = losers[left];
-             return right;
-           }
-       }
+        {
+          unsigned int left = init_winner (2 * root);
+          unsigned int right = init_winner (2 * root + 1);
+          if ( losers[right].sup ||
+                (!losers[left].sup && !comp(KEY(right), KEY(left))))
+            {
+              // Left one is less or equal.
+              losers[root] = losers[right];
+              return left;
+            }
+          else
+            {
+              // Right one is less.
+              losers[root] = losers[left];
+              return right;
+            }
+        }
     }
 
     inline void
@@ -505,18 +495,18 @@ namespace __gnu_parallel
     {
       int source = losers[0].source;
       for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted.
-         if (sup || (!losers[pos].sup && comp(KEY(pos), KEY_SOURCE(source))))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
+        {
+          // The smaller one gets promoted.
+          if (sup || (!losers[pos].sup && comp(KEY(pos), KEY_SOURCE(source))))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
 #ifdef COPY
-             std::swap(KEY(pos), KEY_SOURCE(source));
+              std::swap(KEY(pos), KEY_SOURCE(source));
 #endif
-           }
-       }
+            }
+        }
 
       losers[0].sup = sup;
       losers[0].source = source;
@@ -533,27 +523,27 @@ namespace __gnu_parallel
     init_winner_stable(unsigned int root)
     {
       if (root >= k)
-       {
-         return root;
-       }
+        {
+          return root;
+        }
       else
-       {
-         unsigned int left = init_winner (2 * root);
-         unsigned int right = init_winner (2 * root + 1);
-         if (losers[right].sup
-             || (!losers[left].sup && !comp(KEY(right), KEY(left))))
-           {
-             // Left one is less or equal.
-             losers[root] = losers[right];
-             return left;
-           }
-         else
-           {
-             // Right one is less.
-             losers[root] = losers[left];
-             return right;
-           }
-       }
+        {
+          unsigned int left = init_winner (2 * root);
+          unsigned int right = init_winner (2 * root + 1);
+          if (losers[right].sup
+              || (!losers[left].sup && !comp(KEY(right), KEY(left))))
+            {
+              // Left one is less or equal.
+              losers[root] = losers[right];
+              return left;
+            }
+          else
+            {
+              // Right one is less.
+              losers[root] = losers[left];
+              return right;
+            }
+        }
     }
 
     inline void
@@ -565,21 +555,22 @@ namespace __gnu_parallel
     {
       int source = losers[0].source;
       for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted, ties are broken by source.
-         if (  (sup && (!losers[pos].sup || losers[pos].source < source)) ||
-               (!sup && !losers[pos].sup &&
-                ((comp(KEY(pos), KEY_SOURCE(source))) ||
-                 (!comp(KEY_SOURCE(source), KEY(pos)) && losers[pos].source < source))))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
+        {
+          // The smaller one gets promoted, ties are broken by source.
+          if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
+                (!sup && !losers[pos].sup &&
+                ((comp(KEY(pos), KEY_SOURCE(source))) ||
+                  (!comp(KEY_SOURCE(source), KEY(pos))
+                    && losers[pos].source < source))))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
 #ifdef COPY
-             std::swap(KEY(pos), KEY_SOURCE(source));
+              std::swap(KEY(pos), KEY_SOURCE(source));
 #endif
-           }
-       }
+            }
+        }
 
       losers[0].sup = sup;
       losers[0].source = source;
@@ -595,13 +586,13 @@ namespace __gnu_parallel
 
 #if _GLIBCXX_LOSER_TREE_POINTER
 
-  /** @brief Guarded loser tree, either copying the whole element into
-      the tree structure, or looking up the element via the index.
-   *  Guarding is done explicitly through one flag sup per element,
-   *  inf is not needed due to a better initialization routine.
-   *  This is a well-performing variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, either copying the whole element into
+    the tree structure, or looking up the element via the index.
+*  Guarding is done explicitly through one flag sup per element,
+*  inf is not needed due to a better initialization routine.
+*  This is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreePointer
   {
   private:
@@ -617,7 +608,9 @@ namespace __gnu_parallel
     Comparator comp;
 
   public:
-    inline LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       ik = _k;
 
@@ -626,19 +619,12 @@ namespace __gnu_parallel
       offset = k;
       losers = new Loser[k * 2];
       for (unsigned int i = ik - 1; i < k; i++)
-       losers[i + k].sup = true;
+        losers[i + k].sup = true;
     }
 
     inline ~LoserTreePointer()
     { delete[] losers; }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < (k * 2); i++)
-       printf("%d    %d from %d,  %d\n", i, losers[i].keyp, losers[i].source, losers[i].sup);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -657,49 +643,50 @@ namespace __gnu_parallel
     init_winner(unsigned int root)
     {
       if (root >= k)
-       {
-         return root;
-       }
+        {
+          return root;
+        }
       else
-       {
-         unsigned int left = init_winner (2 * root);
-         unsigned int right = init_winner (2 * root + 1);
-         if (  losers[right].sup ||
-               (!losers[left].sup && !comp(*losers[right].keyp, *losers[left].keyp)))
-           {
-             // Left one is less or equal.
-             losers[root] = losers[right];
-             return left;
-           }
-         else
-           {
-             // Right one is less.
-             losers[root] = losers[left];
-             return right;
-           }
-       }
+        {
+          unsigned int left = init_winner (2 * root);
+          unsigned int right = init_winner (2 * root + 1);
+          if (losers[right].sup
+                || (!losers[left].sup
+                  && !comp(*losers[right].keyp, *losers[left].keyp)))
+            {
+              // Left one is less or equal.
+              losers[root] = losers[right];
+              return left;
+            }
+          else
+            {
+              // Right one is less.
+              losers[root] = losers[left];
+              return right;
+            }
+        }
     }
 
     inline void
     init()
     { losers[0] = losers[init_winner(1)]; }
 
-    inline void 
+    inline void
     delete_min_insert(const T& key, bool sup)
     {
       const T* keyp = &key;
       int source = losers[0].source;
       for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted.
-         if (sup || (!losers[pos].sup && comp(*losers[pos].keyp, *keyp)))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
-             std::swap(losers[pos].keyp, keyp);
-           }
-       }
+        {
+          // The smaller one gets promoted.
+          if (sup || (!losers[pos].sup && comp(*losers[pos].keyp, *keyp)))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
+              std::swap(losers[pos].keyp, keyp);
+            }
+        }
 
       losers[0].sup = sup;
       losers[0].source = source;
@@ -714,28 +701,28 @@ namespace __gnu_parallel
     init_winner_stable(unsigned int root)
     {
       if (root >= k)
-       {
-         return root;
-       }
+        {
+          return root;
+        }
       else
-       {
-         unsigned int left = init_winner (2 * root);
-         unsigned int right = init_winner (2 * root + 1);
-         if (losers[right].sup
-             || (!losers[left].sup && !comp(*losers[right].keyp, 
-                                            *losers[left].keyp)))
-           {
-             // Left one is less or equal.
-             losers[root] = losers[right];
-             return left;
-           }
-         else
-           {
-             // Right one is less.
-             losers[root] = losers[left];
-             return right;
-           }
-       }
+        {
+          unsigned int left = init_winner (2 * root);
+          unsigned int right = init_winner (2 * root + 1);
+          if (losers[right].sup
+              || (!losers[left].sup && !comp(*losers[right].keyp,
+                                            *losers[left].keyp)))
+            {
+              // Left one is less or equal.
+              losers[root] = losers[right];
+              return left;
+            }
+          else
+            {
+              // Right one is less.
+              losers[root] = losers[left];
+              return right;
+            }
+        }
     }
 
     inline void
@@ -748,20 +735,20 @@ namespace __gnu_parallel
       const T* keyp = &key;
       int source = losers[0].source;
       for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted, ties are broken by source.
-         if (  (sup && (!losers[pos].sup || losers[pos].source < source)) ||
-               (!sup && !losers[pos].sup &&
-                ((comp(*losers[pos].keyp, *keyp)) ||
-                 (!comp(*keyp, *losers[pos].keyp) 
-                  && losers[pos].source < source))))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].sup, sup);
-             std::swap(losers[pos].source, source);
-             std::swap(losers[pos].keyp, keyp);
-           }
-       }
+        {
+          // The smaller one gets promoted, ties are broken by source.
+          if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
+                (!sup && !losers[pos].sup &&
+                ((comp(*losers[pos].keyp, *keyp)) ||
+                  (!comp(*keyp, *losers[pos].keyp)
+                  && losers[pos].source < source))))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].sup, sup);
+              std::swap(losers[pos].source, source);
+              std::swap(losers[pos].keyp, keyp);
+            }
+        }
 
       losers[0].sup = sup;
       losers[0].source = source;
@@ -773,13 +760,13 @@ namespace __gnu_parallel
 
 #if _GLIBCXX_LOSER_TREE_UNGUARDED
 
-  /** @brief Unguarded loser tree, copying the whole element into the
-   * tree structure.
-   *
-   *  No guarding is done, therefore not a single input sequence must
-   *  run empty.  This is a very fast variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Unguarded loser tree, copying the whole element into the
+* tree structure.
+*
+*  No guarding is done, therefore not a single input sequence must
+*  run empty.  This is a very fast variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreeUnguarded
   {
   private:
@@ -798,18 +785,20 @@ namespace __gnu_parallel
     map(unsigned int root, unsigned int begin, unsigned int end)
     {
       if (begin + 1 == end)
-       mapping[begin] = root;
+        mapping[begin] = root;
       else
-       {
-         // Next greater or equal power of 2.
-         unsigned int left = 1 << (log2(end - begin - 1));
-         map(root * 2, begin, begin + left);
-         map(root * 2 + 1, begin + left, end);
-       }
+        {
+          // Next greater or equal power of 2.
+          unsigned int left = 1 << (log2(end - begin - 1));
+          map(root * 2, begin, begin + left);
+          map(root * 2 + 1, begin + left, end);
+        }
     }
 
   public:
-    inline LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       ik = _k;
       // Next greater or equal power of 2.
@@ -826,13 +815,6 @@ namespace __gnu_parallel
       delete[] mapping;
     }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < k + ik; i++)
-       printf("%d    %d from %d\n", i, losers[i].key, losers[i].source);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -849,26 +831,27 @@ namespace __gnu_parallel
     init_winner(unsigned int root, unsigned int begin, unsigned int end)
     {
       if (begin + 1 == end)
-       return mapping[begin];
+        return mapping[begin];
       else
-       {
-         // Next greater or equal power of 2.
-         unsigned int division = 1 << (log2(end - begin - 1));
-         unsigned int left = init_winner(2 * root, begin, begin + division);
-         unsigned int right = init_winner(2 * root + 1, begin + division, end);
-         if (!comp(losers[right].key, losers[left].key))
-           {
-             // Left one is less or equal.
-             losers[root] = losers[right];
-             return left;
-           }
-         else
-           {
-             // Right one is less.
-             losers[root] = losers[left];
-             return right;
-           }
-       }
+        {
+          // Next greater or equal power of 2.
+          unsigned int division = 1 << (log2(end - begin - 1));
+          unsigned int left = init_winner(2 * root, begin, begin + division);
+          unsigned int right =
+                          init_winner(2 * root + 1, begin + division, end);
+          if (!comp(losers[right].key, losers[left].key))
+            {
+              // Left one is less or equal.
+              losers[root] = losers[right];
+              return left;
+            }
+          else
+            {
+              // Right one is less.
+              losers[root] = losers[left];
+              return right;
+            }
+        }
     }
 
     inline void
@@ -883,15 +866,15 @@ namespace __gnu_parallel
       T& keyr = losers[0].key;
       int& source = losers[0].source;
       for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted.
-         if (comp(losers[pos].key, keyr))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].source, source);
-             std::swap(losers[pos].key, keyr);
-           }
-       }
+        {
+          // The smaller one gets promoted.
+          if (comp(losers[pos].key, keyr))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].source, source);
+              std::swap(losers[pos].key, keyr);
+            }
+        }
     }
 
     inline void
@@ -909,16 +892,17 @@ namespace __gnu_parallel
       T& keyr = losers[0].key;
       int& source = losers[0].source;
       for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted, ties are broken by source.
-         if (comp(losers[pos].key, keyr)
-             || (!comp(keyr, losers[pos].key) && losers[pos].source < source))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].source, source);
-             std::swap(losers[pos].key, keyr);
-           }
-       }
+        {
+          // The smaller one gets promoted, ties are broken by source.
+          if (comp(losers[pos].key, keyr)
+              || (!comp(keyr, losers[pos].key)
+                && losers[pos].source < source))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].source, source);
+              std::swap(losers[pos].key, keyr);
+            }
+        }
     }
   };
 
@@ -926,13 +910,13 @@ namespace __gnu_parallel
 
 #if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
 
-  /** @brief Unguarded loser tree, keeping only pointers to the
-   * elements in the tree structure.
-   *
-   *  No guarding is done, therefore not a single input sequence must
-   *  run empty.  This is a very fast variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Unguarded loser tree, keeping only pointers to the
+* elements in the tree structure.
+*
+*  No guarding is done, therefore not a single input sequence must
+*  run empty.  This is a very fast variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreePointerUnguarded
   {
   private:
@@ -950,18 +934,21 @@ namespace __gnu_parallel
     void map(unsigned int root, unsigned int begin, unsigned int end)
     {
       if (begin + 1 == end)
-       mapping[begin] = root;
+        mapping[begin] = root;
       else
-       {
-         // Next greater or equal power of 2.
-         unsigned int left = 1 << (log2(end - begin - 1));
-         map(root * 2, begin, begin + left);
-         map(root * 2 + 1, begin + left, end);
-       }
+        {
+          // Next greater or equal power of 2.
+          unsigned int left = 1 << (log2(end - begin - 1));
+          map(root * 2, begin, begin + left);
+          map(root * 2 + 1, begin + left, end);
+        }
     }
 
   public:
-    inline LoserTreePointerUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreePointerUnguarded(unsigned int _k,
+                              Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       ik = _k;
 
@@ -979,13 +966,6 @@ namespace __gnu_parallel
       delete[] mapping;
     }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < k + ik; i++)
-       printf("%d    %d from %d\n", i, *losers[i].keyp, losers[i].source);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -1002,26 +982,27 @@ namespace __gnu_parallel
     init_winner(unsigned int root, unsigned int begin, unsigned int end)
     {
       if (begin + 1 == end)
-       return mapping[begin];
+        return mapping[begin];
       else
-       {
-         // Next greater or equal power of 2.
-         unsigned int division = 1 << (log2(end - begin - 1));
-         unsigned int left = init_winner(2 * root, begin, begin + division);
-         unsigned int right = init_winner(2 * root + 1, begin + division, end);
-         if (!comp(*losers[right].keyp, *losers[left].keyp))
-           {
-             // Left one is less or equal.
-             losers[root] = losers[right];
-             return left;
-           }
-         else
-           {
-             // Right one is less.
-             losers[root] = losers[left];
-             return right;
-           }
-       }
+        {
+          // Next greater or equal power of 2.
+          unsigned int division = 1 << (log2(end - begin - 1));
+          unsigned int left = init_winner(2 * root, begin, begin + division);
+          unsigned int right
+                          = init_winner(2 * root + 1, begin + division, end);
+          if (!comp(*losers[right].keyp, *losers[left].keyp))
+            {
+              // Left one is less or equal.
+              losers[root] = losers[right];
+              return left;
+            }
+          else
+            {
+              // Right one is less.
+              losers[root] = losers[left];
+              return right;
+            }
+        }
     }
 
     inline void
@@ -1036,15 +1017,15 @@ namespace __gnu_parallel
       const T* keyp = &key;
       int& source = losers[0].source;
       for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted.
-         if (comp(*losers[pos].keyp, *keyp))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].source, source);
-             std::swap(losers[pos].keyp, keyp);
-           }
-       }
+        {
+          // The smaller one gets promoted.
+          if (comp(*losers[pos].keyp, *keyp))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].source, source);
+              std::swap(losers[pos].keyp, keyp);
+            }
+        }
 
       losers[0].keyp = keyp;
     }
@@ -1063,23 +1044,23 @@ namespace __gnu_parallel
       int& source = losers[0].source;
       const T* keyp = &key;
       for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
-       {
-         // The smaller one gets promoted, ties are broken by source.
-         if (comp(*losers[pos].keyp, *keyp)
-             || (!comp(*keyp, *losers[pos].keyp) 
-                 && losers[pos].source < source))
-           {
-             // The other one is smaller.
-             std::swap(losers[pos].source, source);
-             std::swap(losers[pos].keyp, keyp);
-           }
-       }
+        {
+          // The smaller one gets promoted, ties are broken by source.
+          if (comp(*losers[pos].keyp, *keyp)
+              || (!comp(*keyp, *losers[pos].keyp)
+                  && losers[pos].source < source))
+            {
+              // The other one is smaller.
+              std::swap(losers[pos].source, source);
+              std::swap(losers[pos].keyp, keyp);
+            }
+        }
       losers[0].keyp = keyp;
     }
   };
 #endif
 
-  template<typename _ValueTp, class Comparator>
+template<typename _ValueTp, class Comparator>
   struct loser_tree_traits
   {
 #if _GLIBCXX_LOSER_TREE
@@ -1093,7 +1074,7 @@ namespace __gnu_parallel
 #endif
   };
 
-  template<typename _ValueTp, class Comparator>
+template<typename _ValueTp, class Comparator>
   struct loser_tree_unguarded_traits
   {
 #if _GLIBCXX_LOSER_TREE_UNGUARDED
index 2e3be7c..67f2f8c 100644 (file)
 // Public License.
 
 /** @file parallel/multiway_merge.h
- *  @brief Implementation of sequential and parallel multiway merge.
- *
- *  Explanations on the high-speed merging routines in the appendix of
- *
- *  P. Sanders.
- *  Fast priority queues for cached memory.
- *  ACM Journal of Experimental Algorithmics, 5, 2000.
- *
- *  This file is a GNU parallel extension to the Standard C++ Library.
- */
+*  @brief Implementation of sequential and parallel multiway merge.
+*
+*  Explanations on the high-speed merging routines in the appendix of
+*
+*  P. Sanders.
+*  Fast priority queues for cached memory.
+*  ACM Journal of Experimental Algorithmics, 5, 2000.
+*
+*  This file is a GNU parallel extension to the Standard C++ Library.
+*/
 
 // Written by Johannes Singler.
 
 // XXX need iterator typedefs
 namespace __gnu_parallel
 {
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   class guarded_iterator;
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
-           guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+            guarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
-            guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+            guarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
   /** @brief Iterator wrapper supporting an implicit supremum at the end
       of the sequence, dominating all comparisons.
       *  Deriving from RandomAccessIterator is not possible since
       *  RandomAccessIterator need not be a class.
       */
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   class guarded_iterator
   {
   private:
@@ -95,17 +95,17 @@ namespace __gnu_parallel
 
   public:
     /** @brief Constructor. Sets iterator to beginning of sequence.
-     *  @param begin Begin iterator of sequence.
-     *  @param end End iterator of sequence.
-     *  @param comp Comparator provided for associated overloaded
-     *  compare operators. */
-    inline guarded_iterator(RandomAccessIterator begin, 
-                           RandomAccessIterator end, Comparator& comp) 
+    *  @param begin Begin iterator of sequence.
+    *  @param end End iterator of sequence.
+    *  @param comp Comparator provided for associated overloaded
+    *  compare operators. */
+    inline guarded_iterator(RandomAccessIterator begin,
+                            RandomAccessIterator end, Comparator& comp)
     : current(begin), end(end), comp(comp)
     { }
 
     /** @brief Pre-increment operator.
-     *  @return This. */
+    *  @return This. */
     inline guarded_iterator<RandomAccessIterator, Comparator>&
     operator++()
     {
@@ -114,31 +114,35 @@ namespace __gnu_parallel
     }
 
     /** @brief Dereference operator.
-     *  @return Referenced element. */
+    *  @return Referenced element. */
     inline typename std::iterator_traits<RandomAccessIterator>::value_type
     operator*()
     { return *current; }
 
     /** @brief Convert to wrapped iterator.
-     *  @return Wrapped iterator. */
+    *  @return Wrapped iterator. */
     inline operator RandomAccessIterator()
     { return current; }
 
     friend bool
-    operator< <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+    operator< <RandomAccessIterator, Comparator>(
+        guarded_iterator<RandomAccessIterator, Comparator>& bi1,
+        guarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
     friend bool
-    operator<= <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+    operator<= <RandomAccessIterator, Comparator>(
+        guarded_iterator<RandomAccessIterator, Comparator>& bi1,
+        guarded_iterator<RandomAccessIterator, Comparator>& bi2);
   };
 
-  /** @brief Compare two elements referenced by guarded iterators.
  *  @param bi1 First iterator.
  *  @param bi2 Second iterator.
  *  @return @c True if less. */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief Compare two elements referenced by guarded iterators.
+ *  @param bi1 First iterator.
+ *  @param bi2 Second iterator.
+ *  @return @c True if less. */
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
-           guarded_iterator<RandomAccessIterator, Comparator>& bi2)
+            guarded_iterator<RandomAccessIterator, Comparator>& bi2)
   {
     if (bi1.current == bi1.end)        //bi1 is sup
       return bi2.current == bi2.end;   //bi2 is not sup
@@ -147,14 +151,14 @@ namespace __gnu_parallel
     return (bi1.comp)(*bi1, *bi2);     //normal compare
   }
 
-  /** @brief Compare two elements referenced by guarded iterators.
  *  @param bi1 First iterator.
  *  @param bi2 Second iterator.
  *  @return @c True if less equal. */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief Compare two elements referenced by guarded iterators.
+ *  @param bi1 First iterator.
+ *  @param bi2 Second iterator.
+ *  @return @c True if less equal. */
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
-            guarded_iterator<RandomAccessIterator, Comparator>& bi2)
+            guarded_iterator<RandomAccessIterator, Comparator>& bi2)
   {
     if (bi2.current == bi2.end)        //bi1 is sup
       return bi1.current != bi1.end;   //bi2 is not sup
@@ -163,20 +167,20 @@ namespace __gnu_parallel
     return !(bi1.comp)(*bi2, *bi1);    //normal compare
   }
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   class unguarded_iterator;
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
-           unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
+            unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
-            unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
+             unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   class unguarded_iterator
   {
   private:
@@ -187,16 +191,16 @@ namespace __gnu_parallel
 
   public:
     /** @brief Constructor. Sets iterator to beginning of sequence.
-     *  @param begin Begin iterator of sequence.
-     *  @param end Unused, only for compatibility.
-     *  @param comp Unused, only for compatibility. */
-    inline unguarded_iterator(RandomAccessIterator begin, 
-                             RandomAccessIterator end, Comparator& comp) 
+    *  @param begin Begin iterator of sequence.
+    *  @param end Unused, only for compatibility.
+    *  @param comp Unused, only for compatibility. */
+    inline unguarded_iterator(RandomAccessIterator begin,
+                              RandomAccessIterator end, Comparator& comp)
     : current(begin), comp(comp)
     { }
 
     /** @brief Pre-increment operator.
-     *  @return This. */
+    *  @return This. */
     inline  unguarded_iterator<RandomAccessIterator, Comparator>&
     operator++()
     {
@@ -205,77 +209,85 @@ namespace __gnu_parallel
     }
 
     /** @brief Dereference operator.
-     *  @return Referenced element. */
-    inline typename std::iterator_traits<RandomAccessIterator>::value_type 
+    *  @return Referenced element. */
+    inline typename std::iterator_traits<RandomAccessIterator>::value_type
     operator*()
     { return *current; }
 
     /** @brief Convert to wrapped iterator.
-     *  @return Wrapped iterator. */
+    *  @return Wrapped iterator. */
     inline
     operator RandomAccessIterator()
     { return current; }
 
     friend bool
-    operator< <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
+    operator< <RandomAccessIterator, Comparator>(
+        unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
+        unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
     friend bool
-    operator<= <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
+    operator<= <RandomAccessIterator, Comparator>(
+        unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
+        unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
   };
 
-  /** @brief Compare two elements referenced by unguarded iterators.
  *  @param bi1 First iterator.
  *  @param bi2 Second iterator.
  *  @return @c True if less. */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief Compare two elements referenced by unguarded iterators.
+ *  @param bi1 First iterator.
+ *  @param bi2 Second iterator.
+ *  @return @c True if less. */
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
-           unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
+            unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
   {
     // Normal compare.
     return (bi1.comp)(*bi1, *bi2);
   }
 
-  /** @brief Compare two elements referenced by unguarded iterators.
  *  @param bi1 First iterator.
  *  @param bi2 Second iterator.
  *  @return @c True if less equal. */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief Compare two elements referenced by unguarded iterators.
+ *  @param bi1 First iterator.
+ *  @param bi2 Second iterator.
+ *  @return @c True if less equal. */
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
-            unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
+            unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
   {
     // Normal compare.
     return !(bi1.comp)(*bi2, *bi1);
   }
 
-  /** Prepare a set of sequences to be merged without a (end) guard
-   *  @param seqs_begin
-   *  @param seqs_end
-   *  @param comp
-   *  @param min_sequence
-   *  @param stable
-   *  @pre (seqs_end - seqs_begin > 0) */
-  template<typename RandomAccessIteratorIterator, typename Comparator>
-  typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
+/** Prepare a set of sequences to be merged without a (end) guard
+ *  @param seqs_begin
+ *  @param seqs_end
+ *  @param comp
+ *  @param min_sequence
+ *  @param stable
+ *  @pre (seqs_end - seqs_begin > 0) */
+template<typename RandomAccessIteratorIterator, typename Comparator>
+  typename std::iterator_traits<
+      typename std::iterator_traits<RandomAccessIteratorIterator>::value_type
+      ::first_type>::difference_type
   prepare_unguarded(RandomAccessIteratorIterator seqs_begin,
-                   RandomAccessIteratorIterator seqs_end, Comparator comp,
-                   int& min_sequence, bool stable)
+                    RandomAccessIteratorIterator seqs_end, Comparator comp,
+                    int& min_sequence, bool stable)
   {
     _GLIBCXX_CALL(seqs_end - seqs_begin)
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+        ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
+    typedef typename std::iterator_traits<RandomAccessIterator1>
+      ::difference_type
       difference_type;
 
     if ((*seqs_begin).first == (*seqs_begin).second)
       {
-       // Empty sequence found, it's the first one.
-       min_sequence = 0;
-       return -1;
+        // Empty sequence found, it's the first one.
+        min_sequence = 0;
+        return -1;
       }
 
     // Last element in sequence.
@@ -283,20 +295,20 @@ namespace __gnu_parallel
     min_sequence = 0;
     for (RandomAccessIteratorIterator s = seqs_begin + 1; s != seqs_end; s++)
       {
-       if ((*s).first == (*s).second)
-         {
-           // Empty sequence found.
-           min_sequence = static_cast<int>(s - seqs_begin);
-           return -1;
-         }
-
-       // Last element in sequence.
-       const value_type& v = *((*s).second - 1);
-       if (comp(v, min))       //strictly smaller
-         {
-           min = v;
-           min_sequence = static_cast<int>(s - seqs_begin);
-         }
+        if ((*s).first == (*s).second)
+          {
+            // Empty sequence found.
+            min_sequence = static_cast<int>(s - seqs_begin);
+            return -1;
+          }
+
+        // Last element in sequence.
+        const value_type& v = *((*s).second - 1);
+        if (comp(v, min))      //strictly smaller
+          {
+            min = v;
+            min_sequence = static_cast<int>(s - seqs_begin);
+          }
       }
 
     difference_type overhang_size = 0;
@@ -304,93 +316,108 @@ namespace __gnu_parallel
     int s = 0;
     for (s = 0; s <= min_sequence; s++)
       {
-       RandomAccessIterator1 split;
-       if (stable)
-         split = std::upper_bound(seqs_begin[s].first, seqs_begin[s].second,
-                                  min, comp);
-       else
-         split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second,
-                                  min, comp);
-
-       overhang_size += seqs_begin[s].second - split;
+        RandomAccessIterator1 split;
+        if (stable)
+          split = std::upper_bound(seqs_begin[s].first, seqs_begin[s].second,
+                                  min, comp);
+        else
+          split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second,
+                                  min, comp);
+
+        overhang_size += seqs_begin[s].second - split;
       }
 
     for (; s < (seqs_end - seqs_begin); s++)
       {
-       RandomAccessIterator1 split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, min, comp);
-       overhang_size += seqs_begin[s].second - split;
+        RandomAccessIterator1 split = std::lower_bound(
+            seqs_begin[s].first, seqs_begin[s].second, min, comp);
+        overhang_size += seqs_begin[s].second - split;
       }
 
     // So many elements will be left over afterwards.
     return overhang_size;
   }
 
-  /** Prepare a set of sequences to be merged with a (end) guard (sentinel)
-   *  @param seqs_begin
-   *  @param seqs_end
-   *  @param comp */
-  template<typename RandomAccessIteratorIterator, typename Comparator>
-  typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
+/** Prepare a set of sequences to be merged with a (end) guard (sentinel)
+ *  @param seqs_begin
+ *  @param seqs_end
+ *  @param comp */
+template<typename RandomAccessIteratorIterator, typename Comparator>
+  typename std::iterator_traits<typename std::iterator_traits<
+      RandomAccessIteratorIterator>::value_type::first_type>::difference_type
   prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin,
-                            RandomAccessIteratorIterator seqs_end,
-                            Comparator comp)
+                            RandomAccessIteratorIterator seqs_end,
+                            Comparator comp)
   {
     _GLIBCXX_CALL(seqs_end - seqs_begin)
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
+    typedef typename std::iterator_traits<RandomAccessIterator1>
+      ::value_type
       value_type;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
+    typedef typename std::iterator_traits<RandomAccessIterator1>
+      ::difference_type
       difference_type;
 
     // Last element in sequence.
     value_type* max = NULL;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
       {
-       if ((*s).first == (*s).second)
-         continue;
+        if ((*s).first == (*s).second)
+          continue;
 
-       // Last element in sequence.
-       value_type& v = *((*s).second - 1);
+        // Last element in sequence.
+        value_type& v = *((*s).second - 1);
 
-       // Strictly greater.
-       if (!max || comp(*max, v))
-         max = &v;
+        // Strictly greater.
+        if (!max || comp(*max, v))
+          max = &v;
       }
 
     difference_type overhang_size = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
       {
-       RandomAccessIterator1 split = std::lower_bound((*s).first, (*s).second,
-                                                      *max, comp);
-       overhang_size += (*s).second - split;
+        RandomAccessIterator1 split =
+            std::lower_bound((*s).first, (*s).second, *max, comp);
+        overhang_size += (*s).second - split;
 
-       // Set sentinel.
-       *((*s).second) = *max;
+        // Set sentinel.
+        *((*s).second) = *max;
       }
 
     // So many elements will be left over afterwards.
     return overhang_size;
   }
 
-  /** @brief Highly efficient 3-way merging procedure.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Unused, stable anyway.
-   *  @return End iterator of output sequence. */
-  template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/** @brief Highly efficient 3-way merging procedure.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *  @param stable Unused, stable anyway.
+ *  @return End iterator of output sequence. */
+template<
+    template<typename RAI, typename C> class iterator,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_3_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_3_variant(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length);
-    
+
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -405,25 +432,25 @@ namespace __gnu_parallel
 
     if (seq0 <= seq1)
       {
-       if (seq1 <= seq2)
-         goto s012;
-       else
-         if (seq2 <  seq0)
-           goto s201;
-         else
-           goto s021;
+        if (seq1 <= seq2)
+          goto s012;
+        else
+          if (seq2 <  seq0)
+            goto s201;
+          else
+            goto s021;
       }
     else
       {
-       if (seq1 <= seq2)
-         {
-           if (seq0 <= seq2)
-             goto s102;
-           else
-             goto s120;
-         }
-       else
-         goto s210;
+        if (seq1 <= seq2)
+          {
+            if (seq0 <= seq2)
+              goto s102;
+            else
+              goto s120;
+          }
+        else
+          goto s210;
       }
 
 #define Merge3Case(a,b,c,c0,c1)                                \
@@ -456,14 +483,23 @@ namespace __gnu_parallel
     return target;
   }
 
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin,
+                            RandomAccessIteratorIterator seqs_end,
+                            RandomAccessIterator3 target,
+                            Comparator comp,
+                            _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length);
-    
+
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -472,7 +508,8 @@ namespace __gnu_parallel
     RandomAccessIterator3 target_end;
 
     // Stable anyway.
-    difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
+    difference_type overhang =
+        prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
 
     difference_type total_length = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
@@ -480,16 +517,17 @@ namespace __gnu_parallel
 
     if (overhang != -1)
       {
-       difference_type unguarded_length = std::min(length, total_length - overhang);
-       target_end = multiway_merge_3_variant<unguarded_iterator>
-         (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
-       overhang = length - unguarded_length;
+        difference_type unguarded_length =
+            std::min(length, total_length - overhang);
+        target_end = multiway_merge_3_variant<unguarded_iterator>
+          (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
+        overhang = length - unguarded_length;
       }
     else
       {
-       // Empty sequence found.
-       overhang = length;
-       target_end = target;
+        // Empty sequence found.
+        overhang = length;
+        target_end = target;
       }
 
 #if _GLIBCXX_ASSERTIONS
@@ -500,23 +538,23 @@ namespace __gnu_parallel
     switch (min_seq)
       {
       case 0:
-       // Iterators will be advanced accordingly.
-       target_end = merge_advance(seqs_begin[1].first, seqs_begin[1].second,
-                                  seqs_begin[2].first, seqs_begin[2].second,
-                                  target_end, overhang, comp);
-       break;
+        // Iterators will be advanced accordingly.
+        target_end = merge_advance(seqs_begin[1].first, seqs_begin[1].second,
+                                  seqs_begin[2].first, seqs_begin[2].second,
+                                  target_end, overhang, comp);
+        break;
       case 1:
-       target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second,
-                                  seqs_begin[2].first, seqs_begin[2].second,
-                                  target_end, overhang, comp);
-       break;
+        target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second,
+                                  seqs_begin[2].first, seqs_begin[2].second,
+                                  target_end, overhang, comp);
+        break;
       case 2:
-       target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second,
-                                  seqs_begin[1].first, seqs_begin[1].second,
-                                  target_end, overhang, comp);
-       break;
+        target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second,
+                                  seqs_begin[1].first, seqs_begin[1].second,
+                                  target_end, overhang, comp);
+        break;
       default:
-       _GLIBCXX_PARALLEL_ASSERT(false);
+        _GLIBCXX_PARALLEL_ASSERT(false);
       }
 
 #if _GLIBCXX_ASSERTIONS
@@ -527,22 +565,31 @@ namespace __gnu_parallel
     return target_end;
   }
 
-  /** @brief Highly efficient 4-way merging procedure.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Unused, stable anyway.
-   *  @return End iterator of output sequence. */
-  template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/** @brief Highly efficient 4-way merging procedure.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *  @param stable Unused, stable anyway.
+ *  @return End iterator of output sequence. */
+template<
+    template<typename RAI, typename C> class iterator,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin,
+                           RandomAccessIteratorIterator seqs_end,
+                           RandomAccessIterator3 target,
+                           Comparator comp, _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length);
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -561,26 +608,26 @@ namespace __gnu_parallel
 
     if (seq0 <= seq1)
       {
-       if (seq1 <= seq2)
-         Decision(0,1,2,3)
-         else
-           if (seq2 < seq0)
-             Decision(2,0,1,3)
-             else
-               Decision(0,2,1,3)
-                 }
+        if (seq1 <= seq2)
+          Decision(0,1,2,3)
+          else
+            if (seq2 < seq0)
+              Decision(2,0,1,3)
+              else
+                Decision(0,2,1,3)
+                  }
     else
       {
-       if (seq1 <= seq2)
-         {
-           if (seq0 <= seq2)
-             Decision(1,0,2,3)
-             else
-               Decision(1,2,0,3)
-                 }
-       else
-         Decision(2,1,0,3)
-           }
+        if (seq1 <= seq2)
+          {
+            if (seq0 <= seq2)
+              Decision(1,0,2,3)
+              else
+                Decision(1,2,0,3)
+                  }
+        else
+          Decision(2,1,0,3)
+            }
 
 #define Merge4Case(a,b,c,d,c0,c1,c2)                           \
     s ## a ## b ## c ## d:                                     \
@@ -633,14 +680,23 @@ namespace __gnu_parallel
     return target;
   }
 
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin,
+                            RandomAccessIteratorIterator seqs_end,
+                            RandomAccessIterator3 target,
+                            Comparator comp,
+                            _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length);
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -649,7 +705,8 @@ namespace __gnu_parallel
     RandomAccessIterator3 target_end;
 
     // Stable anyway.
-    difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
+    difference_type overhang =
+        prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
 
     difference_type total_length = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
@@ -657,16 +714,17 @@ namespace __gnu_parallel
 
     if (overhang != -1)
       {
-       difference_type unguarded_length = std::min(length, total_length - overhang);
-       target_end = multiway_merge_4_variant<unguarded_iterator>
-         (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
-       overhang = length - unguarded_length;
+        difference_type unguarded_length =
+            std::min(length, total_length - overhang);
+        target_end = multiway_merge_4_variant<unguarded_iterator>
+          (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
+        overhang = length - unguarded_length;
       }
     else
       {
-       // Empty sequence found.
-       overhang = length;
-       target_end = target;
+        // Empty sequence found.
+        overhang = length;
+        target_end = target;
       }
 
 #if _GLIBCXX_ASSERTIONS
@@ -674,10 +732,13 @@ namespace __gnu_parallel
     _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
 #endif
 
-    std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > one_missing(seqs_begin, seqs_end);
+    std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> >
+        one_missing(seqs_begin, seqs_end);
     one_missing.erase(one_missing.begin() + min_seq);  //remove
 
-    target_end = multiway_merge_3_variant<guarded_iterator>(one_missing.begin(), one_missing.end(), target_end, comp, overhang, stable);
+    target_end = multiway_merge_3_variant<guarded_iterator>(
+        one_missing.begin(), one_missing.end(),
+        target_end, comp, overhang, stable);
 
     // Insert back again.
     one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]);
@@ -692,26 +753,34 @@ namespace __gnu_parallel
     return target_end;
   }
 
-  /** @brief Basic multi-way merging procedure.
-   *
-   *  The head elements are kept in a sorted array, new heads are
-   *  inserted linearly.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Stable merging incurs a performance penalty.
-   *  @return End iterator of output sequence. 
-   */
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/** @brief Basic multi-way merging procedure.
+ *
+ *  The head elements are kept in a sorted array, new heads are
+ *  inserted linearly.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *  @param stable Stable merging incurs a performance penalty.
+ *  @return End iterator of output sequence.
+ */
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin,
+                        RandomAccessIteratorIterator seqs_end,
+                        RandomAccessIterator3 target,
+                        Comparator comp, _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -719,7 +788,8 @@ namespace __gnu_parallel
     // Num remaining pieces.
     int k = static_cast<int>(seqs_end - seqs_begin), nrp;
 
-    value_type* pl = static_cast<value_type*>(::operator new(sizeof(value_type) * k));
+    value_type* pl = static_cast<value_type*>(
+      ::operator new(sizeof(value_type) * k));
     int* source = new int[k];
     difference_type total_length = 0;
 
@@ -730,137 +800,138 @@ namespace __gnu_parallel
     nrp = 0;
     for (int pi = 0; pi < k; pi++)
       {
-       if (STOPS(pi) != POS(pi))
-         {
-           pl[nrp] = *(POS(pi));
-           source[nrp] = pi;
-           nrp++;
-           total_length += LENGTH(seqs_begin[pi]);
-         }
+        if (STOPS(pi) != POS(pi))
+          {
+            pl[nrp] = *(POS(pi));
+            source[nrp] = pi;
+            nrp++;
+            total_length += LENGTH(seqs_begin[pi]);
+          }
       }
 
     if (stable)
       {
-       for (int k = 0; k < nrp - 1; k++)
-         for (int pi = nrp - 1; pi > k; pi--)
-           if (comp(pl[pi], pl[pi - 1]) ||
-               (!comp(pl[pi - 1], pl[pi]) && source[pi] < source[pi - 1]))
-             {
-               std::swap(pl[pi - 1], pl[pi]);
-               std::swap(source[pi - 1], source[pi]);
-             }
+        for (int k = 0; k < nrp - 1; k++)
+          for (int pi = nrp - 1; pi > k; pi--)
+            if (comp(pl[pi], pl[pi - 1]) ||
+                (!comp(pl[pi - 1], pl[pi]) && source[pi] < source[pi - 1]))
+              {
+                std::swap(pl[pi - 1], pl[pi]);
+                std::swap(source[pi - 1], source[pi]);
+              }
       }
     else
       {
-       for (int k = 0; k < nrp - 1; k++)
-         for (int pi = nrp - 1; pi > k; pi--)
-           if (comp(pl[pi], pl[pi-1]))
-             {
-               std::swap(pl[pi-1], pl[pi]);
-               std::swap(source[pi-1], source[pi]);
-             }
+        for (int k = 0; k < nrp - 1; k++)
+          for (int pi = nrp - 1; pi > k; pi--)
+            if (comp(pl[pi], pl[pi-1]))
+              {
+                std::swap(pl[pi-1], pl[pi]);
+                std::swap(source[pi-1], source[pi]);
+              }
       }
 
     // Iterate.
     if (stable)
       {
-       int j;
-       while (nrp > 0 && length > 0)
-         {
-           if (source[0] < source[1])
-             {
-               // pl[0] <= pl[1]
-               while ((nrp == 1 || !(comp(pl[1], pl[0]))) && length > 0)
-                 {
-                   *target = pl[0];
-                   ++target;
-                   ++POS(source[0]);
-                   length--;
-                   if (POS(source[0]) == STOPS(source[0]))
-                     {
-                       // Move everything to the left.
-                       for (int s = 0; s < nrp - 1; s++)
-                         {
-                           pl[s] = pl[s + 1];
-                           source[s] = source[s + 1];
-                         }
-                       nrp--;
-                       break;
-                     }
-                   else
-                     pl[0] = *(POS(source[0]));
-                 }
-             }
-           else
-             {
-               // pl[0] < pl[1]
-               while ((nrp == 1 || comp(pl[0], pl[1])) && length > 0)
-                 {
-                   *target = pl[0];
-                   ++target;
-                   ++POS(source[0]);
-                   length--;
-                   if (POS(source[0]) == STOPS(source[0]))
-                     {
-                       for (int s = 0; s < nrp - 1; s++)
-                         {
-                           pl[s] = pl[s + 1];
-                           source[s] = source[s + 1];
-                         }
-                       nrp--;
-                       break;
-                     }
-                   else
-                     pl[0] = *(POS(source[0]));
-                 }
-             }
-
-           // Sink down.
-           j = 1;
-           while ((j < nrp) && (comp(pl[j], pl[j - 1]) ||
-                                (!comp(pl[j - 1], pl[j]) && (source[j] < source[j - 1]))))
-             {
-               std::swap(pl[j - 1], pl[j]);
-               std::swap(source[j - 1], source[j]);
-               j++;
-             }
-         }
+        int j;
+        while (nrp > 0 && length > 0)
+          {
+            if (source[0] < source[1])
+              {
+                // pl[0] <= pl[1]
+                while ((nrp == 1 || !(comp(pl[1], pl[0]))) && length > 0)
+                  {
+                    *target = pl[0];
+                    ++target;
+                    ++POS(source[0]);
+                    length--;
+                    if (POS(source[0]) == STOPS(source[0]))
+                      {
+                        // Move everything to the left.
+                        for (int s = 0; s < nrp - 1; s++)
+                          {
+                            pl[s] = pl[s + 1];
+                            source[s] = source[s + 1];
+                          }
+                        nrp--;
+                        break;
+                      }
+                    else
+                      pl[0] = *(POS(source[0]));
+                  }
+              }
+            else
+              {
+                // pl[0] < pl[1]
+                while ((nrp == 1 || comp(pl[0], pl[1])) && length > 0)
+                  {
+                    *target = pl[0];
+                    ++target;
+                    ++POS(source[0]);
+                    length--;
+                    if (POS(source[0]) == STOPS(source[0]))
+                      {
+                        for (int s = 0; s < nrp - 1; s++)
+                          {
+                            pl[s] = pl[s + 1];
+                            source[s] = source[s + 1];
+                          }
+                        nrp--;
+                        break;
+                      }
+                    else
+                      pl[0] = *(POS(source[0]));
+                  }
+              }
+
+            // Sink down.
+            j = 1;
+            while ((j < nrp) && (comp(pl[j], pl[j - 1]) ||
+                                (!comp(pl[j - 1], pl[j])
+                                    && (source[j] < source[j - 1]))))
+              {
+                std::swap(pl[j - 1], pl[j]);
+                std::swap(source[j - 1], source[j]);
+                j++;
+              }
+          }
       }
     else
       {
-       int j;
-       while (nrp > 0 && length > 0)
-         {
-           // pl[0] <= pl[1]
-           while (nrp == 1 || (!comp(pl[1], pl[0])) && length > 0)
-             {
-               *target = pl[0];
-               ++target;
-               ++POS(source[0]);
-               length--;
-               if (POS(source[0]) == STOPS(source[0]))
-                 {
-                   for (int s = 0; s < (nrp - 1); s++)
-                     {
-                       pl[s] = pl[s + 1];
-                       source[s] = source[s + 1];
-                     }
-                   nrp--;
-                   break;
-                 }
-               else
-                 pl[0] = *(POS(source[0]));
-             }
-
-           // Sink down.
-           j = 1;
-           while ((j < nrp) && comp(pl[j], pl[j - 1]))
-             {
-               std::swap(pl[j - 1], pl[j]);
-               std::swap(source[j - 1], source[j]);
-               j++;
-             }
-         }
+        int j;
+        while (nrp > 0 && length > 0)
+          {
+            // pl[0] <= pl[1]
+            while (nrp == 1 || (!comp(pl[1], pl[0])) && length > 0)
+              {
+                *target = pl[0];
+                ++target;
+                ++POS(source[0]);
+                length--;
+                if (POS(source[0]) == STOPS(source[0]))
+                  {
+                    for (int s = 0; s < (nrp - 1); s++)
+                      {
+                        pl[s] = pl[s + 1];
+                        source[s] = source[s + 1];
+                      }
+                    nrp--;
+                    break;
+                  }
+                else
+                  pl[0] = *(POS(source[0]));
+              }
+
+            // Sink down.
+            j = 1;
+            while ((j < nrp) && comp(pl[j], pl[j - 1]))
+              {
+                std::swap(pl[j - 1], pl[j]);
+                std::swap(source[j - 1], source[j]);
+                j++;
+              }
+          }
       }
 
     delete[] pl;
@@ -869,26 +940,36 @@ namespace __gnu_parallel
     return target;
   }
 
-  /** @brief Multi-way merging procedure for a high branching factor,
-   * guarded case.
-   *
-   *  The head elements are kept in a loser tree.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Stable merging incurs a performance penalty.
-   *  @return End iterator of output sequence. 
-   */
-  template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/** @brief Multi-way merging procedure for a high branching factor,
+ * guarded case.
+ *
+ *  The head elements are kept in a loser tree.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *   @param stable Stable merging incurs a performance penalty.
+ *  @return End iterator of output sequence.
+ */
+template<
+    typename LT,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin,
+                            RandomAccessIteratorIterator seqs_end,
+                            RandomAccessIterator3 target,
+                            Comparator comp,
+                            _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -941,64 +1022,73 @@ namespace __gnu_parallel
 
     if (stable)
       {
-       for (difference_type i = 0; i < total_length; i++)
-         {
-           // Take out.
-           source = lt.get_min_source();
+        for (difference_type i = 0; i < total_length; i++)
+          {
+            // Take out.
+            source = lt.get_min_source();
 
-           *(target++) = *(seqs_begin[source].first++);
+            *(target++) = *(seqs_begin[source].first++);
 
-           // Feed.
-           if (seqs_begin[source].first == seqs_begin[source].second)
-             lt.delete_min_insert_stable(*arbitrary_element, true);
-           else
-             // Replace from same source.
-             lt.delete_min_insert_stable(*seqs_begin[source].first, false);
+            // Feed.
+            if (seqs_begin[source].first == seqs_begin[source].second)
+              lt.delete_min_insert_stable(*arbitrary_element, true);
+            else
+              // Replace from same source.
+              lt.delete_min_insert_stable(*seqs_begin[source].first, false);
 
-         }
+          }
       }
     else
       {
-       for (difference_type i = 0; i < total_length; i++)
-         {
-           //take out
-           source = lt.get_min_source();
-
-           *(target++) = *(seqs_begin[source].first++);
-
-           // Feed.
-           if (seqs_begin[source].first == seqs_begin[source].second)
-             lt.delete_min_insert(*arbitrary_element, true);
-           else
-             // Replace from same source.
-             lt.delete_min_insert(*seqs_begin[source].first, false);
-         }
+        for (difference_type i = 0; i < total_length; i++)
+          {
+            //take out
+            source = lt.get_min_source();
+
+            *(target++) = *(seqs_begin[source].first++);
+
+            // Feed.
+            if (seqs_begin[source].first == seqs_begin[source].second)
+              lt.delete_min_insert(*arbitrary_element, true);
+            else
+              // Replace from same source.
+              lt.delete_min_insert(*seqs_begin[source].first, false);
+          }
       }
 
     return target;
   }
 
-  /** @brief Multi-way merging procedure for a high branching factor,
-   * unguarded case.
-   *
-   *  The head elements are kept in a loser tree.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Stable merging incurs a performance penalty.
-   *  @return End iterator of output sequence.
-   *  @pre No input will run out of elements during the merge. 
-   */
-  template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/** @brief Multi-way merging procedure for a high branching factor,
+ * unguarded case.
+ *
+ *  The head elements are kept in a loser tree.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *  @param stable Stable merging incurs a performance penalty.
+ *  @return End iterator of output sequence.
+ *  @pre No input will run out of elements during the merge.
+ */
+template<
+    typename LT,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp, typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin,
+                                      RandomAccessIteratorIterator seqs_end,
+                                      RandomAccessIterator3 target,
+                                      Comparator comp,
+                                      _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -1012,14 +1102,14 @@ namespace __gnu_parallel
     for (int t = 0; t < k; t++)
       {
 #if _GLIBCXX_ASSERTIONS
-       _GLIBCXX_PARALLEL_ASSERT(seqs_begin[t].first != seqs_begin[t].second);
+        _GLIBCXX_PARALLEL_ASSERT(seqs_begin[t].first != seqs_begin[t].second);
 #endif
-       if (stable)
-         lt.insert_start_stable(*seqs_begin[t].first, t, false);
-       else
-         lt.insert_start(*seqs_begin[t].first, t, false);
+        if (stable)
+          lt.insert_start_stable(*seqs_begin[t].first, t, false);
+        else
+          lt.insert_start(*seqs_begin[t].first, t, false);
 
-       total_length += LENGTH(seqs_begin[t]);
+        total_length += LENGTH(seqs_begin[t]);
       }
 
     if (stable)
@@ -1038,68 +1128,84 @@ namespace __gnu_parallel
 
     if (stable)
       {
-       RandomAccessIterator3 target_end = target + length;
-       while (target < target_end)
-         {
-           // Take out.
-           source = lt.get_min_source();
+        RandomAccessIterator3 target_end = target + length;
+        while (target < target_end)
+          {
+            // Take out.
+            source = lt.get_min_source();
 
 #if _GLIBCXX_ASSERTIONS
-           _GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
+            _GLIBCXX_PARALLEL_ASSERT(i == 0
+                || !comp(*(seqs_begin[source].first), *(target - 1)));
 #endif
 
-           *(target++) = *(seqs_begin[source].first++);
+            *(target++) = *(seqs_begin[source].first++);
 
 #if _GLIBCXX_ASSERTIONS
-           _GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i == length - 1));
-           i++;
+            _GLIBCXX_PARALLEL_ASSERT(
+                (seqs_begin[source].first != seqs_begin[source].second)
+                || (i == length - 1));
+            i++;
 #endif
-           // Feed.
-           // Replace from same source.
-           lt.delete_min_insert_stable(*seqs_begin[source].first, false);
+            // Feed.
+            // Replace from same source.
+            lt.delete_min_insert_stable(*seqs_begin[source].first, false);
 
-         }
+          }
       }
     else
       {
-       RandomAccessIterator3 target_end = target + length;
-       while (target < target_end)
-         {
-           // Take out.
-           source = lt.get_min_source();
+        RandomAccessIterator3 target_end = target + length;
+        while (target < target_end)
+          {
+            // Take out.
+            source = lt.get_min_source();
 
 #if _GLIBCXX_ASSERTIONS
-           if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1)))
-             printf("         %i %i %i\n", length, i, source);
-           _GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
+            if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1)))
+              printf("         %i %i %i\n", length, i, source);
+            _GLIBCXX_PARALLEL_ASSERT(i == 0
+                || !comp(*(seqs_begin[source].first), *(target - 1)));
 #endif
 
-           *(target++) = *(seqs_begin[source].first++);
+            *(target++) = *(seqs_begin[source].first++);
 
 #if _GLIBCXX_ASSERTIONS
-           if (!((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1)))
-             printf("         %i %i %i\n", length, i, source);
-           _GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1));
-           i++;
+            if (!((seqs_begin[source].first != seqs_begin[source].second)
+                || (i >= length - 1)))
+              printf("         %i %i %i\n", length, i, source);
+            _GLIBCXX_PARALLEL_ASSERT(
+                (seqs_begin[source].first != seqs_begin[source].second)
+                || (i >= length - 1));
+            i++;
 #endif
-           // Feed.
-           // Replace from same source.
-           lt.delete_min_insert(*seqs_begin[source].first, false);
-         }
+            // Feed.
+            // Replace from same source.
+            lt.delete_min_insert(*seqs_begin[source].first, false);
+          }
       }
 
     return target;
   }
 
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin,
+                                     RandomAccessIteratorIterator seqs_end,
+                                     RandomAccessIterator3 target,
+                                     Comparator comp,
+                                     _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -1107,7 +1213,7 @@ namespace __gnu_parallel
     int min_seq;
     RandomAccessIterator3 target_end;
     difference_type overhang = prepare_unguarded(seqs_begin, seqs_end,
-                                         comp, min_seq, stable);
+                                          comp, min_seq, stable);
 
     difference_type total_length = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
@@ -1115,17 +1221,18 @@ namespace __gnu_parallel
 
     if (overhang != -1)
       {
-       difference_type unguarded_length = std::min(length, total_length - overhang);
-       target_end = multiway_merge_loser_tree_unguarded
-         <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
-         (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
-       overhang = length - unguarded_length;
+        difference_type unguarded_length =
+            std::min(length, total_length - overhang);
+        target_end = multiway_merge_loser_tree_unguarded
+          <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
+          (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
+        overhang = length - unguarded_length;
       }
     else
       {
-       // Empty sequence found.
-       overhang = length;
-       target_end = target;
+        // Empty sequence found.
+        overhang = length;
+        target_end = target;
       }
 
 #if _GLIBCXX_ASSERTIONS
@@ -1145,34 +1252,43 @@ namespace __gnu_parallel
     return target_end;
   }
 
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin,
+                                     RandomAccessIteratorIterator seqs_end,
+                                      RandomAccessIterator3 target,
+                                      Comparator comp,
+                                      _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
     typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
-      RandomAccessIterator1;
 
     RandomAccessIterator3 target_end;
-    difference_type overhang = prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
+    difference_type overhang =
+        prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
 
     difference_type total_length = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
       {
-       total_length += LENGTH(*s);
+        total_length += LENGTH(*s);
 
-       // Sentinel spot.
-       (*s).second++;
+        // Sentinel spot.
+        (*s).second++;
       }
 
-    difference_type unguarded_length = std::min(length, total_length - overhang);
+    difference_type unguarded_length =
+        std::min(length, total_length - overhang);
     target_end = multiway_merge_loser_tree_unguarded
       <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
       (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
@@ -1184,14 +1300,17 @@ namespace __gnu_parallel
 #endif
 
     // Copy rest stable.
-    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end && overhang > 0; s++)
+    for (RandomAccessIteratorIterator s = seqs_begin;
+         s != seqs_end && overhang > 0; s++)
       {
-       // Restore.
-       (*s).second--;
-       difference_type local_length = std::min((difference_type)overhang, (difference_type)LENGTH(*s));
-       target_end = std::copy((*s).first, (*s).first + local_length, target_end);
-       (*s).first += local_length;
-       overhang -= local_length;
+        // Restore.
+        (*s).second--;
+        difference_type local_length =
+            std::min<difference_type>(overhang, LENGTH(*s));
+        target_end = std::copy((*s).first, (*s).first + local_length,
+                               target_end);
+        (*s).first += local_length;
+        overhang -= local_length;
       }
 
 #if _GLIBCXX_ASSERTIONS
@@ -1203,25 +1322,35 @@ namespace __gnu_parallel
     return target_end;
   }
 
-  /** @brief Sequential multi-way merging switch.
-   *
-   *  The decision if based on the branching factor and runtime settings.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Stable merging incurs a performance penalty.
-   *  @param sentinel The sequences have a sentinel element.
-   *  @return End iterator of output sequence. */
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/** @brief Sequential multi-way merging switch.
+ *
+ *  The decision if based on the branching factor and runtime settings.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *  @param stable Stable merging incurs a performance penalty.
+ *  @param sentinel The sequences have a sentinel element.
+ *  @return End iterator of output sequence. */
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel, sequential_tag)
+  multiway_merge(RandomAccessIteratorIterator seqs_begin,
+                 RandomAccessIteratorIterator seqs_end,
+                 RandomAccessIterator3 target,
+                 Comparator comp, _DifferenceTp length,
+                 bool stable, bool sentinel,
+                 sequential_tag)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -1234,7 +1363,8 @@ namespace __gnu_parallel
     RandomAccessIterator3 return_target = target;
     int k = static_cast<int>(seqs_end - seqs_begin);
 
-    Settings::MultiwayMergeAlgorithm mwma = Settings::multiway_merge_algorithm;
+    Settings::MultiwayMergeAlgorithm mwma =
+        Settings::multiway_merge_algorithm;
 
     if (!sentinel && mwma == Settings::LOSER_TREE_SENTINEL)
       mwma = Settings::LOSER_TREE_COMBINED;
@@ -1242,75 +1372,126 @@ namespace __gnu_parallel
     switch (k)
       {
       case 0:
-       break;
+        break;
       case 1:
-       return_target = std::copy(seqs_begin[0].first, seqs_begin[0].first + length, target);
-       seqs_begin[0].first += length;
-       break;
+        return_target = std::copy(seqs_begin[0].first,
+                                  seqs_begin[0].first + length,
+                                  target);
+        seqs_begin[0].first += length;
+        break;
       case 2:
-       return_target = merge_advance(seqs_begin[0].first, seqs_begin[0].second, seqs_begin[1].first, seqs_begin[1].second, target, length, comp);
-       break;
+        return_target = merge_advance(seqs_begin[0].first,
+                                      seqs_begin[0].second,
+                                      seqs_begin[1].first,
+                                      seqs_begin[1].second,
+                                      target, length, comp);
+        break;
       case 3:
-       switch (mwma)
-         {
-         case Settings::LOSER_TREE_COMBINED:
-           return_target = multiway_merge_3_combined(seqs_begin, seqs_end, target, comp, length, stable);
-           break;
-         case Settings::LOSER_TREE_SENTINEL:
-           return_target = multiway_merge_3_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
-           break;
-         default:
-           return_target = multiway_merge_3_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
-           break;
-         }
-       break;
+        switch (mwma)
+          {
+          case Settings::LOSER_TREE_COMBINED:
+            return_target = multiway_merge_3_combined(seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
+            break;
+          case Settings::LOSER_TREE_SENTINEL:
+            return_target = multiway_merge_3_variant<unguarded_iterator>(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
+            break;
+          default:
+            return_target = multiway_merge_3_variant<guarded_iterator>(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
+            break;
+          }
+        break;
       case 4:
-       switch (mwma)
-         {
-         case Settings::LOSER_TREE_COMBINED:
-           return_target = multiway_merge_4_combined(seqs_begin, seqs_end, target, comp, length, stable);
-           break;
-         case Settings::LOSER_TREE_SENTINEL:
-           return_target = multiway_merge_4_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
-           break;
-         default:
-           return_target = multiway_merge_4_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
-           break;
-         }
-       break;
+        switch (mwma)
+          {
+          case Settings::LOSER_TREE_COMBINED:
+            return_target = multiway_merge_4_combined(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
+            break;
+          case Settings::LOSER_TREE_SENTINEL:
+            return_target = multiway_merge_4_variant<unguarded_iterator>(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
+            break;
+          default:
+            return_target = multiway_merge_4_variant<guarded_iterator>(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
+            break;
+          }
+        break;
       default:
-       {
-         switch (mwma)
-           {
-           case Settings::BUBBLE:
-             return_target = multiway_merge_bubble(seqs_begin, seqs_end, target, comp, length, stable);
-             break;
+        {
+          switch (mwma)
+            {
+            case Settings::BUBBLE:
+              return_target = multiway_merge_bubble(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
+              break;
 #if _GLIBCXX_LOSER_TREE_EXPLICIT
-           case Settings::LOSER_TREE_EXPLICIT:
-             return_target = multiway_merge_loser_tree<LoserTreeExplicit<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
-             break;
+            case Settings::LOSER_TREE_EXPLICIT:
+              return_target = multiway_merge_loser_tree<
+                    LoserTreeExplicit<value_type, Comparator> >(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
+              break;
 #endif
 #if _GLIBCXX_LOSER_TREE
-           case Settings::LOSER_TREE:
-             return_target = multiway_merge_loser_tree<LoserTree<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
-             break;
+            case Settings::LOSER_TREE:
+              return_target = multiway_merge_loser_tree<
+                    LoserTree<value_type, Comparator> >(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
+              break;
 #endif
 #if _GLIBCXX_LOSER_TREE_COMBINED
-           case Settings::LOSER_TREE_COMBINED:
-             return_target = multiway_merge_loser_tree_combined(seqs_begin, seqs_end, target, comp, length, stable);
-             break;
+            case Settings::LOSER_TREE_COMBINED:
+              return_target = multiway_merge_loser_tree_combined(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
+              break;
 #endif
 #if _GLIBCXX_LOSER_TREE_SENTINEL
-           case Settings::LOSER_TREE_SENTINEL:
-             return_target = multiway_merge_loser_tree_sentinel(seqs_begin, seqs_end, target, comp, length, stable);
-             break;
+            case Settings::LOSER_TREE_SENTINEL:
+              return_target = multiway_merge_loser_tree_sentinel(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
+              break;
 #endif
-           default:
-             // multiway_merge algorithm not implemented.
-             _GLIBCXX_PARALLEL_ASSERT(0);
-             break;
-           }
-       }
+            default:
+              // multiway_merge algorithm not implemented.
+              _GLIBCXX_PARALLEL_ASSERT(0);
+              break;
+            }
+        }
       }
 #if _GLIBCXX_ASSERTIONS
     _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
@@ -1319,214 +1500,268 @@ namespace __gnu_parallel
     return return_target;
   }
 
-  /** @brief Parallel multi-way merge routine.
-   *
-   *  The decision if based on the branching factor and runtime settings.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Stable merging incurs a performance penalty.
-   *  @param sentinel Ignored.
-   *  @return End iterator of output sequence. 
-   */
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/** @brief Parallel multi-way merge routine.
+ *
+ *  The decision if based on the branching factor and runtime settings.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *  @param stable Stable merging incurs a performance penalty.
+ *  @param sentinel Ignored.
+ *  @return End iterator of output sequence.
+ */
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel)
-  {
-    _GLIBCXX_CALL(length)
-
-    typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
-      RandomAccessIterator1;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
-      value_type;
-
-#if _GLIBCXX_ASSERTIONS
-    for (RandomAccessIteratorIterator rii = seqs_begin; rii != seqs_end; rii++)
-      _GLIBCXX_PARALLEL_ASSERT(is_sorted((*rii).first, (*rii).second, comp));
-#endif
-
-    // k sequences.
-    int k = static_cast<int>(seqs_end - seqs_begin);
-
-    difference_type total_length = 0;
-    for (RandomAccessIteratorIterator raii = seqs_begin; raii != seqs_end; raii++)
-      total_length += LENGTH(*raii);
-
-    _GLIBCXX_CALL(total_length)
-
-    if (total_length == 0 || k == 0)
-      return target;
-
-    thread_index_t num_threads = static_cast<thread_index_t>(std::min(static_cast<difference_type>(get_max_threads()), total_length));
-
-    bool tight = (total_length == length);
-
-    // Thread t will have to merge pieces[iam][0..k - 1]
-    std::vector<std::pair<difference_type, difference_type> >* pieces = new std::vector<std::pair<difference_type, difference_type> >[num_threads];
-    for (int s = 0; s < num_threads; s++)
-      pieces[s].resize(k);
-
-    difference_type num_samples = Settings::merge_oversampling * num_threads;
-
-    if (Settings::multiway_merge_splitting == Settings::SAMPLING)
-      {
-       value_type* samples = static_cast<value_type*>(::operator new(sizeof(value_type) * k * num_samples));
-       // Sample.
-       for (int s = 0; s < k; s++)
-         for (int i = 0; (difference_type)i < num_samples; i++)
-           {
-             difference_type sample_index = static_cast<difference_type>(LENGTH(seqs_begin[s]) * (double(i + 1) / (num_samples + 1)) * (double(length) / total_length));
-             samples[s * num_samples + i] = seqs_begin[s].first[sample_index];
-           }
-
-       if (stable)
-         __gnu_sequential::stable_sort(samples, samples + (num_samples * k), comp);
-       else
-         __gnu_sequential::sort(samples, samples + (num_samples * k), comp);
-
-       for (int slab = 0; slab < num_threads; slab++)
-         // For each slab / processor.
-         for (int seq = 0; seq < k; seq++)
-           {
-             // For each sequence.
-             if (slab > 0)
-               pieces[slab][seq].first = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * slab / num_threads], comp) - seqs_begin[seq].first;
-             else
-               {
-                 // Absolute beginning.
-                 pieces[slab][seq].first = 0;
-               }
-             if ((slab + 1) < num_threads)
-               pieces[slab][seq].second = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * (slab + 1) / num_threads], comp) - seqs_begin[seq].first;
-             else
-               pieces[slab][seq].second = LENGTH(seqs_begin[seq]);     //absolute ending
-           }
-       delete[] samples;
-      }
-    else
-      {
-       // (Settings::multiway_merge_splitting == Settings::EXACT).
-       std::vector<RandomAccessIterator1>* offsets = new std::vector<RandomAccessIterator1>[num_threads];
-       std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > se(k);
-
-       copy(seqs_begin, seqs_end, se.begin());
-
-       difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
-       equally_split(length, num_threads, borders);
-
-       for (int s = 0; s < (num_threads - 1); s++)
-         {
-           offsets[s].resize(k);
-           multiseq_partition(se.begin(), se.end(), borders[s + 1],
-                              offsets[s].begin(), comp);
-
-           // Last one also needed and available.
-           if (!tight)
-             {
-               offsets[num_threads - 1].resize(k);
-               multiseq_partition(se.begin(), se.end(), 
-                                  difference_type(length), 
-                                  offsets[num_threads - 1].begin(),  comp);
-             }
-         }
-
-
-       for (int slab = 0; slab < num_threads; slab++)
-         {
-           // For each slab / processor.
-           for (int seq = 0; seq < k; seq++)
-             {
-               // For each sequence.
-               if (slab == 0)
-                 {
-                   // Absolute beginning.
-                   pieces[slab][seq].first = 0;
-                 }
-               else
-                 pieces[slab][seq].first = pieces[slab - 1][seq].second;
-               if (!tight || slab < (num_threads - 1))
-                 pieces[slab][seq].second = offsets[slab][seq] - seqs_begin[seq].first;
-               else
-                 {
-                   // slab == num_threads - 1
-                   pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
-                 }
-             }
-         }
-       delete[] offsets;
-      }
-
-#      pragma omp parallel num_threads(num_threads)
+  parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin,
+                          RandomAccessIteratorIterator seqs_end,
+                           RandomAccessIterator3 target,
+                           Comparator comp,
+                           _DifferenceTp length, bool stable, bool sentinel)
     {
-      thread_index_t iam = omp_get_thread_num();
-
-      difference_type target_position = 0;
-
-      for (int c = 0; c < k; c++)
-       target_position += pieces[iam][c].first;
-
-      if (k > 2)
-       {
-         std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks = new std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
-
-         difference_type local_length = 0;
-         for (int s = 0; s < k; s++)
-           {
-             chunks[s] = std::make_pair(seqs_begin[s].first + pieces[iam][s].first, seqs_begin[s].first + pieces[iam][s].second);
-             local_length += LENGTH(chunks[s]);
-           }
-
-         multiway_merge(chunks, chunks + k, target + target_position, comp,
-                        std::min(local_length, length - target_position),
-                        stable, false, sequential_tag());
-
-         delete[] chunks;
-       }
-      else if (k == 2)
-       {
-         RandomAccessIterator1 begin0 = seqs_begin[0].first + pieces[iam][0].first, begin1 = seqs_begin[1].first + pieces[iam][1].first;
-         merge_advance(begin0,
-                       seqs_begin[0].first + pieces[iam][0].second,
-                       begin1,
-                       seqs_begin[1].first + pieces[iam][1].second,
-                       target + target_position,
-                       (pieces[iam][0].second - pieces[iam][0].first) + (pieces[iam][1].second - pieces[iam][1].first),
-                       comp);
-       }
-    }
+      _GLIBCXX_CALL(length)
+
+      typedef _DifferenceTp difference_type;
+      typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+        ::value_type::first_type
+        RandomAccessIterator1;
+      typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
+        value_type;
+
+      // k sequences.
+      int k = static_cast<int>(seqs_end - seqs_begin);
+
+      difference_type total_length = 0;
+      for (RandomAccessIteratorIterator raii = seqs_begin;
+           raii != seqs_end; raii++)
+        total_length += LENGTH(*raii);
+
+      _GLIBCXX_CALL(total_length)
+
+      if (total_length == 0 || k == 0)
+        return target;
+
+      bool tight = (total_length == length);
+
+      std::vector<std::pair<difference_type, difference_type> >* pieces;
+
+      thread_index_t num_threads = static_cast<thread_index_t>(
+          std::min<difference_type>(get_max_threads(), total_length));
+
+#     pragma omp parallel num_threads (num_threads)
+        {
+#         pragma omp single
+            {
+              num_threads = omp_get_num_threads();
+              // Thread t will have to merge pieces[iam][0..k - 1]
+              pieces = new std::vector<
+                  std::pair<difference_type, difference_type> >[num_threads];
+              for (int s = 0; s < num_threads; s++)
+                pieces[s].resize(k);
+
+              difference_type num_samples =
+                  Settings::merge_oversampling * num_threads;
+
+              if (Settings::multiway_merge_splitting == Settings::SAMPLING)
+                {
+                  value_type* samples = static_cast<value_type*>(
+                    ::operator new(sizeof(value_type) * k * num_samples));
+                  // Sample.
+                  for (int s = 0; s < k; s++)
+                    for (int i = 0; (difference_type)i < num_samples; i++)
+                      {
+                        difference_type sample_index =
+                            static_cast<difference_type>(
+                                LENGTH(seqs_begin[s]) * (double(i + 1) /
+                                (num_samples + 1)) * (double(length)
+                                / total_length));
+                        samples[s * num_samples + i] =
+                            seqs_begin[s].first[sample_index];
+                      }
+
+                  if (stable)
+                    __gnu_sequential::stable_sort(
+                      samples, samples + (num_samples * k), comp);
+                  else
+                    __gnu_sequential::sort(
+                      samples, samples + (num_samples * k), comp);
+
+                  for (int slab = 0; slab < num_threads; slab++)
+                    // For each slab / processor.
+                    for (int seq = 0; seq < k; seq++)
+                      {
+                        // For each sequence.
+                        if (slab > 0)
+                          pieces[slab][seq].first =
+                              std::upper_bound(
+                                seqs_begin[seq].first,
+                                seqs_begin[seq].second,
+                                samples[num_samples * k * slab / num_threads],
+                                  comp)
+                              - seqs_begin[seq].first;
+                        else
+                          {
+                            // Absolute beginning.
+                            pieces[slab][seq].first = 0;
+                          }
+                        if ((slab + 1) < num_threads)
+                          pieces[slab][seq].second =
+                              std::upper_bound(
+                                  seqs_begin[seq].first,
+                                  seqs_begin[seq].second,
+                                  samples[num_samples * k * (slab + 1) /
+                                      num_threads], comp)
+                              - seqs_begin[seq].first;
+                        else
+                        pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
+                      }
+                    delete[] samples;
+                }
+              else
+                {
+                  // (Settings::multiway_merge_splitting == Settings::EXACT).
+                  std::vector<RandomAccessIterator1>* offsets =
+                      new std::vector<RandomAccessIterator1>[num_threads];
+                  std::vector<
+                      std::pair<RandomAccessIterator1, RandomAccessIterator1>
+                      > se(k);
+
+                  copy(seqs_begin, seqs_end, se.begin());
+
+                  difference_type* borders =
+                      new difference_type[num_threads + 1];
+                  equally_split(length, num_threads, borders);
+
+                  for (int s = 0; s < (num_threads - 1); s++)
+                    {
+                      offsets[s].resize(k);
+                      multiseq_partition(
+                          se.begin(), se.end(), borders[s + 1],
+                          offsets[s].begin(), comp);
+
+                      // Last one also needed and available.
+                      if (!tight)
+                        {
+                          offsets[num_threads - 1].resize(k);
+                          multiseq_partition(se.begin(), se.end(),
+                                difference_type(length),
+                                offsets[num_threads - 1].begin(),  comp);
+                        }
+                    }
+
+
+                  for (int slab = 0; slab < num_threads; slab++)
+                    {
+                      // For each slab / processor.
+                      for (int seq = 0; seq < k; seq++)
+                        {
+                          // For each sequence.
+                          if (slab == 0)
+                            {
+                              // Absolute beginning.
+                              pieces[slab][seq].first = 0;
+                            }
+                          else
+                            pieces[slab][seq].first =
+                                pieces[slab - 1][seq].second;
+                          if (!tight || slab < (num_threads - 1))
+                            pieces[slab][seq].second =
+                                offsets[slab][seq] - seqs_begin[seq].first;
+                          else
+                            {
+                              // slab == num_threads - 1
+                              pieces[slab][seq].second =
+                                  LENGTH(seqs_begin[seq]);
+                            }
+                        }
+                    }
+                  delete[] offsets;
+                }
+            } //single
+
+          thread_index_t iam = omp_get_thread_num();
+
+          difference_type target_position = 0;
+
+          for (int c = 0; c < k; c++)
+            target_position += pieces[iam][c].first;
+
+          if (k > 2)
+            {
+              std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks
+                = new
+                  std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
+
+              difference_type local_length = 0;
+              for (int s = 0; s < k; s++)
+                {
+                  chunks[s] = std::make_pair(
+                      seqs_begin[s].first + pieces[iam][s].first,
+                      seqs_begin[s].first + pieces[iam][s].second);
+                  local_length += LENGTH(chunks[s]);
+                }
+
+              multiway_merge(
+                    chunks, chunks + k, target + target_position, comp,
+                    std::min(local_length, length - target_position),
+                    stable, false, sequential_tag());
+
+              delete[] chunks;
+            }
+          else if (k == 2)
+            {
+              RandomAccessIterator1
+                  begin0 = seqs_begin[0].first + pieces[iam][0].first,
+                  begin1 = seqs_begin[1].first + pieces[iam][1].first;
+              merge_advance(begin0,
+                    seqs_begin[0].first + pieces[iam][0].second,
+                    begin1,
+                    seqs_begin[1].first + pieces[iam][1].second,
+                    target + target_position,
+                    (pieces[iam][0].second - pieces[iam][0].first) +
+                        (pieces[iam][1].second - pieces[iam][1].first),
+                    comp);
+            }
+        } //parallel
 
 #if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
+      _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
 #endif
 
-    // Update ends of sequences.
-    for (int s = 0; s < k; s++)
-      seqs_begin[s].first += pieces[num_threads - 1][s].second;
+      // Update ends of sequences.
+      for (int s = 0; s < k; s++)
+        seqs_begin[s].first += pieces[num_threads - 1][s].second;
 
-    delete[] pieces;
+      delete[] pieces;
 
-    return target + length;
-  }
+      return target + length;
+    }
 
-  /** 
-   *  @brief Multi-way merging front-end.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Stable merging incurs a performance penalty.
-   *  @return End iterator of output sequence. 
-   */
-  template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/**
+ *  @brief Multi-way merging front-end.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *  @param stable Stable merging incurs a performance penalty.
+ *  @return End iterator of output sequence.
+ */
+template<
+    typename RandomAccessIteratorPairIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
   multiway_merge(RandomAccessIteratorPairIterator seqs_begin,
-                RandomAccessIteratorPairIterator seqs_end,
-                RandomAccessIterator3 target, Comparator comp,
-                _DifferenceTp length, bool stable)
+                RandomAccessIteratorPairIterator seqs_end,
+                RandomAccessIterator3 target, Comparator comp,
+                _DifferenceTp length, bool stable)
   {
     typedef _DifferenceTp difference_type;
     _GLIBCXX_CALL(seqs_end - seqs_begin)
@@ -1535,33 +1770,43 @@ namespace __gnu_parallel
       return target;
 
     RandomAccessIterator3 target_end;
-    if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
-      target_end = parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (difference_type)length, stable, false);
+    if (_GLIBCXX_PARALLEL_CONDITION(
+        ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
+        && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge(
+          seqs_begin, seqs_end,
+          target, comp, static_cast<difference_type>(length), stable, false);
     else
-      target_end = multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, false, sequential_tag());
+      target_end = multiway_merge(
+          seqs_begin, seqs_end,
+          target, comp, length, stable, false, sequential_tag());
 
     return target_end;
   }
 
-  /** @brief Multi-way merging front-end.
-   *  @param seqs_begin Begin iterator of iterator pair input sequence.
-   *  @param seqs_end End iterator of iterator pair input sequence.
-   *  @param target Begin iterator out output sequence.
-   *  @param comp Comparator.
-   *  @param length Maximum length to merge.
-   *  @param stable Stable merging incurs a performance penalty.
-   *  @return End iterator of output sequence.
-   *  @pre For each @c i, @c seqs_begin[i].second must be the end
-   *  marker of the sequence, but also reference the one more sentinel
-   *  element. */
-  template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+/** @brief Multi-way merging front-end.
+ *  @param seqs_begin Begin iterator of iterator pair input sequence.
+ *  @param seqs_end End iterator of iterator pair input sequence.
+ *  @param target Begin iterator out output sequence.
+ *  @param comp Comparator.
+ *  @param length Maximum length to merge.
+ *  @param stable Stable merging incurs a performance penalty.
+ *  @return End iterator of output sequence.
+ *  @pre For each @c i, @c seqs_begin[i].second must be the end
+ *  marker of the sequence, but also reference the one more sentinel
+ *  element. */
+template<
+    typename RandomAccessIteratorPairIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
   multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin,
-                         RandomAccessIteratorPairIterator seqs_end,
-                         RandomAccessIterator3 target,
-                         Comparator comp,
-                         _DifferenceTp length,
-                         bool stable)
+                          RandomAccessIteratorPairIterator seqs_end,
+                          RandomAccessIterator3 target,
+                          Comparator comp,
+                          _DifferenceTp length,
+                          bool stable)
   {
     typedef _DifferenceTp difference_type;
 
@@ -1570,10 +1815,16 @@ namespace __gnu_parallel
 
     _GLIBCXX_CALL(seqs_end - seqs_begin)
 
-    if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
-      return parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (typename std::iterator_traits<RandomAccessIterator3>::difference_type)length, stable, true);
+    if (_GLIBCXX_PARALLEL_CONDITION(
+        ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
+        && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
+      return parallel_multiway_merge(
+          seqs_begin, seqs_end,
+          target, comp, static_cast<difference_type>(length), stable, true);
     else
-      return multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, true, sequential_tag());
+      return multiway_merge(
+          seqs_begin, seqs_end,
+          target, comp, length, stable, true, sequential_tag());
   }
 }
 
index 89285e1..5dedd00 100644 (file)
@@ -48,8 +48,8 @@
 namespace __gnu_parallel
 {
 
-  /** @brief Subsequence description. */
-  template<typename _DifferenceTp>
+/** @brief Subsequence description. */
+template<typename _DifferenceTp>
   struct Piece
   {
     typedef _DifferenceTp difference_type;
@@ -61,16 +61,19 @@ namespace __gnu_parallel
     difference_type end;
   };
 
-  /** @brief Data accessed by all threads.
-   *
-   *  PMWMS = parallel multiway mergesort */
-  template<typename RandomAccessIterator>
+/** @brief Data accessed by all threads.
+  *
+  *  PMWMS = parallel multiway mergesort */
+template<typename RandomAccessIterator>
   struct PMWMSSortingData
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
+    /** @brief Number of threads involved. */
+    thread_index_t num_threads;
+
     /** @brief Input begin. */
     RandomAccessIterator source;
 
@@ -105,62 +108,55 @@ namespace __gnu_parallel
 
     /** @brief Pieces of data to merge @c [thread][sequence] */
     std::vector<Piece<difference_type> >* pieces;
-  };
 
-  /** @brief Thread local data for PMWMS. */
-  template<typename RandomAccessIterator>
-  struct PMWMSSorterPU
-  {
-    /** @brief Total number of thread involved. */
-    thread_index_t num_threads;
-    /** @brief Number of owning thread. */
-    thread_index_t iam;
     /** @brief Stable sorting desired. */
     bool stable;
-    /** @brief Pointer to global data. */
-    PMWMSSortingData<RandomAccessIterator>* sd;
-  };
-
-  /** 
-   *  @brief Select samples from a sequence.
-   *  @param d Pointer to thread-local data. Result will be placed in
-   *  @c d->ds->samples.
-   *  @param num_samples Number of samples to select. 
-   */
-  template<typename RandomAccessIterator, typename _DifferenceTp>
+};
+
+/**
+  *  @brief Select samples from a sequence.
+  *  @param sd Pointer to algorithm data. Result will be placed in
+  *  @c sd->samples.
+  *  @param num_samples Number of samples to select.
+  */
+template<typename RandomAccessIterator, typename _DifferenceTp>
   inline void 
-  determine_samples(PMWMSSorterPU<RandomAccessIterator>* d, 
-                   _DifferenceTp& num_samples)
+  determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
+                    _DifferenceTp& num_samples)
   {
     typedef _DifferenceTp difference_type;
 
-    PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
+    thread_index_t iam = omp_get_thread_num();
 
-    num_samples = Settings::sort_mwms_oversampling * d->num_threads - 1;
+    num_samples =
+        Settings::sort_mwms_oversampling * sd->num_threads - 1;
 
-    difference_type* es = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_samples + 2)));
+    difference_type* es = new difference_type[num_samples + 2];
 
-    equally_split(sd->starts[d->iam + 1] - sd->starts[d->iam], num_samples + 1, es);
+    equally_split(sd->starts[iam + 1] - sd->starts[iam], 
+                  num_samples + 1, es);
 
     for (difference_type i = 0; i < num_samples; i++)
-      sd->samples[d->iam * num_samples + i] = sd->source[sd->starts[d->iam] + es[i + 1]];
+      sd->samples[iam * num_samples + i] =
+          sd->source[sd->starts[iam] + es[i + 1]];
+
+    delete[] es;
   }
 
-  /** @brief PMWMS code executed by each thread.
-   *  @param d Pointer to thread-local data.
-   *  @param comp Comparator. 
-   */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief PMWMS code executed by each thread.
+  *  @param sd Pointer to algorithm data.
+  *  @param comp Comparator.
+  */
+template<typename RandomAccessIterator, typename Comparator>
   inline void 
-  parallel_sort_mwms_pu(PMWMSSorterPU<RandomAccessIterator>* d, 
-                       Comparator& comp)
+  parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
+                        Comparator& comp)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
-    PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
-    thread_index_t iam = d->iam;
+    thread_index_t iam = omp_get_thread_num();
 
     // Length of this thread's chunk, before merging.
     difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];
@@ -174,161 +170,168 @@ namespace __gnu_parallel
     typedef value_type* SortingPlacesIterator;
 
     // Sort in temporary storage, leave space for sentinel.
-    sd->sorting_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * (length_local + 1)));
+    sd->sorting_places[iam] = sd->temporaries[iam] = 
+        static_cast<value_type*>(
+        ::operator new(sizeof(value_type) * (length_local + 1)));
 
     // Copy there.
-    std::uninitialized_copy(sd->source + sd->starts[iam], sd->source + sd->starts[iam] + length_local, sd->sorting_places[iam]);
+    std::uninitialized_copy(sd->source + sd->starts[iam],
+                            sd->source + sd->starts[iam] + length_local,
+                            sd->sorting_places[iam]);
 #endif
 
     // Sort locally.
-    if (d->stable)
-      __gnu_sequential::stable_sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp);
+    if (sd->stable)
+      __gnu_sequential::stable_sort(sd->sorting_places[iam],
+                                    sd->sorting_places[iam] + length_local,
+                                    comp);
     else
-      __gnu_sequential::sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp);
-
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp));
-#endif
+      __gnu_sequential::sort(sd->sorting_places[iam],
+                             sd->sorting_places[iam] + length_local,
+                             comp);
 
     // Invariant: locally sorted subsequence in sd->sorting_places[iam],
     // sd->sorting_places[iam] + length_local.
 
     if (Settings::sort_splitting == Settings::SAMPLING)
       {
-       difference_type num_samples;
-       determine_samples(d, num_samples);
-
-#pragma omp barrier
-
-#pragma omp single
-       __gnu_sequential::sort(sd->samples, 
-                              sd->samples + (num_samples * d->num_threads), 
-                              comp);
-
-#pragma omp barrier
-
-       for (int s = 0; s < d->num_threads; s++)
-         {
-           // For each sequence.
-           if (num_samples * iam > 0)
-             sd->pieces[iam][s].begin = std::lower_bound(sd->sorting_places[s],
-                                sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
-                                sd->samples[num_samples * iam],
-                                comp)
-               - sd->sorting_places[s];
-           else
-             // Absolute beginning.
-             sd->pieces[iam][s].begin = 0;
-
-           if ((num_samples * (iam + 1)) < (num_samples * d->num_threads))
-             sd->pieces[iam][s].end = std::lower_bound(sd->sorting_places[s],
-                                                       sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], sd->samples[num_samples * (iam + 1)], comp)
-               - sd->sorting_places[s];
-           else
-             // Absolute end.
-             sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
-         }
-
+        difference_type num_samples;
+        determine_samples(sd, num_samples);
+
+#       pragma omp barrier
+
+#       pragma omp single
+        __gnu_sequential::sort(sd->samples,
+                               sd->samples + (num_samples * sd->num_threads),
+                               comp);
+
+#       pragma omp barrier
+
+        for (int s = 0; s < sd->num_threads; s++)
+          {
+            // For each sequence.
+              if (num_samples * iam > 0)
+                sd->pieces[iam][s].begin = 
+                    std::lower_bound(sd->sorting_places[s],
+                        sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
+                        sd->samples[num_samples * iam],
+                        comp)
+                    - sd->sorting_places[s];
+            else
+              // Absolute beginning.
+              sd->pieces[iam][s].begin = 0;
+
+            if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
+              sd->pieces[iam][s].end =
+                  std::lower_bound(sd->sorting_places[s],
+                                   sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
+                                   sd->samples[num_samples * (iam + 1)], comp)
+                  - sd->sorting_places[s];
+            else
+              // Absolute end.
+              sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
+            }
       }
     else if (Settings::sort_splitting == Settings::EXACT)
       {
-#pragma omp barrier
-
-       std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
-       for (int s = 0; s < d->num_threads; s++)
-         seqs[s] = std::make_pair(sd->sorting_places[s], sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
-
-       std::vector<SortingPlacesIterator> offsets(d->num_threads);
-
-       // If not last thread.
-       if (iam < d->num_threads - 1)
-         multiseq_partition(seqs.begin(), seqs.end(), sd->starts[iam + 1], offsets.begin(), comp);
-
-       for (int seq = 0; seq < d->num_threads; seq++)
-         {
-           // For each sequence.
-           if (iam < (d->num_threads - 1))
-             sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
-           else
-             // Absolute end of this sequence.
-             sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq];
-         }
-
-#pragma omp barrier
-
-       for (int seq = 0; seq < d->num_threads; seq++)
-         {
-           // For each sequence.
-           if (iam > 0)
-             sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
-           else
-             // Absolute beginning.
-             sd->pieces[iam][seq].begin = 0;
-         }
+#       pragma omp barrier
+
+        std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
+            seqs(sd->num_threads);
+        for (int s = 0; s < sd->num_threads; s++)
+          seqs[s] = std::make_pair(sd->sorting_places[s],
+                                   sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
+
+        std::vector<SortingPlacesIterator> offsets(sd->num_threads);
+
+        // if not last thread
+        if (iam < sd->num_threads - 1)
+          multiseq_partition(seqs.begin(), seqs.end(),
+                             sd->starts[iam + 1], offsets.begin(), comp);
+
+        for (int seq = 0; seq < sd->num_threads; seq++)
+          {
+            // for each sequence
+            if (iam < (sd->num_threads - 1))
+              sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
+            else
+              // very end of this sequence
+              sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq];
+          }
+
+#       pragma omp barrier
+
+        for (int seq = 0; seq < sd->num_threads; seq++)
+          {
+            // For each sequence.
+            if (iam > 0)
+              sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
+            else
+              // Absolute beginning.
+              sd->pieces[iam][seq].begin = 0;
+          }
       }
 
     // Offset from target begin, length after merging.
     difference_type offset = 0, length_am = 0;
-    for (int s = 0; s < d->num_threads; s++)
+    for (int s = 0; s < sd->num_threads; s++)
       {
-       length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
-       offset += sd->pieces[iam][s].begin;
+        length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
+        offset += sd->pieces[iam][s].begin;
       }
 
 #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
     // Merge to temporary storage, uninitialized creation not possible
     // since there is no multiway_merge calling the placement new
     // instead of the assignment operator.
-    sd->merging_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * length_am));
+    sd->merging_places[iam] = sd->temporaries[iam] =
+        static_cast<value_type*>(
+        ::operator new(sizeof(value_type) * length_am));
 #else
     // Merge directly to target.
     sd->merging_places[iam] = sd->source + offset;
 #endif
-    std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
+    std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
+        seqs(sd->num_threads);
 
-    for (int s = 0; s < d->num_threads; s++)
+    for (int s = 0; s < sd->num_threads; s++)
       {
-       seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin, sd->sorting_places[s] + sd->pieces[iam][s].end);
-
-#if _GLIBCXX_ASSERTIONS
-       _GLIBCXX_PARALLEL_ASSERT(is_sorted(seqs[s].first, seqs[s].second, comp));
-#endif
+        seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
+                                 sd->sorting_places[s] + sd->pieces[iam][s].end);
       }
 
-    multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, d->stable, false, sequential_tag());
-
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->merging_places[iam], sd->merging_places[iam] + length_am, comp));
-#endif
+    multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, sd->stable, false, sequential_tag());
 
-#      pragma omp barrier
+#   pragma omp barrier
 
 #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
     // Write back.
-    std::copy(sd->merging_places[iam], sd->merging_places[iam] + length_am, 
-             sd->source + offset);
+    std::copy(sd->merging_places[iam],
+              sd->merging_places[iam] + length_am,
+              sd->source + offset);
 #endif
 
     delete[] sd->temporaries[iam];
   }
 
-  /** @brief PMWMS main call.
-   *  @param begin Begin iterator of sequence.
-   *  @param end End iterator of sequence.
-   *  @param comp Comparator.
-   *  @param n Length of sequence.
-   *  @param num_threads Number of threads to use.
-   *  @param stable Stable sorting.
-   */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief PMWMS main call.
+  *  @param begin Begin iterator of sequence.
+  *  @param end End iterator of sequence.
+  *  @param comp Comparator.
+  *  @param n Length of sequence.
+  *  @param num_threads Number of threads to use.
+  *  @param stable Stable sorting.
+  */
+template<typename RandomAccessIterator, typename Comparator>
   inline void
-  parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end, 
-                    Comparator comp, 
-       typename std::iterator_traits<RandomAccessIterator>::difference_type n, 
-                    int num_threads, bool stable)
+  parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
+                     Comparator comp, 
+                     typename std::iterator_traits<RandomAccessIterator>::difference_type n,
+                     int num_threads,
+                     bool stable)
   {
     _GLIBCXX_CALL(n)
-      
+
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
@@ -336,75 +339,75 @@ namespace __gnu_parallel
     if (n <= 1)
       return;
 
-    // At least one element per thread.
+    // at least one element per thread
     if (num_threads > n)
       num_threads = static_cast<thread_index_t>(n);
 
+    // shared variables
     PMWMSSortingData<RandomAccessIterator> sd;
+    difference_type* starts;
 
-    sd.source = begin;
-    sd.temporaries = new value_type*[num_threads];
+#   pragma omp parallel num_threads(num_threads)
+      {
+        num_threads = omp_get_num_threads();  //no more threads than requested
+
+#       pragma omp single
+          {
+            sd.num_threads = num_threads;
+            sd.source = begin;
+            sd.temporaries = new value_type*[num_threads];
 
 #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
-    sd.sorting_places = new RandomAccessIterator[num_threads];
-    sd.merging_places = new value_type*[num_threads];
+            sd.sorting_places = new RandomAccessIterator[num_threads];
+            sd.merging_places = new value_type*[num_threads];
 #else
-    sd.sorting_places = new value_type*[num_threads];
-    sd.merging_places = new RandomAccessIterator[num_threads];
+            sd.sorting_places = new value_type*[num_threads];
+            sd.merging_places = new RandomAccessIterator[num_threads];
 #endif
 
-    if (Settings::sort_splitting == Settings::SAMPLING)
-      {
-       unsigned int sz = Settings::sort_mwms_oversampling * num_threads - 1;
-       sz *= num_threads;
-       
-       // Equivalent to value_type[sz], without need of default construction.
-       sz *= sizeof(value_type);
-       sd.samples = static_cast<value_type*>(::operator new(sz));
-      }
-    else
-      sd.samples = NULL;
-
-    sd.offsets = new difference_type[num_threads - 1];
-    sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
-    for (int s = 0; s < num_threads; s++)
-      sd.pieces[s].resize(num_threads);
-    PMWMSSorterPU<RandomAccessIterator>* pus = new PMWMSSorterPU<RandomAccessIterator>[num_threads];
-    difference_type* starts = sd.starts = new difference_type[num_threads + 1];
-
-    difference_type chunk_length = n / num_threads;
-    difference_type split = n % num_threads;
-    difference_type start = 0;
-    for (int i = 0; i < num_threads; i++)
-      {
-       starts[i] = start;
-       start += (i < split) ? (chunk_length + 1) : chunk_length;
-       pus[i].num_threads = num_threads;
-       pus[i].iam = i;
-       pus[i].sd = &sd;
-       pus[i].stable = stable;
-      }
-    starts[num_threads] = start;
-
-    // Now sort in parallel.
-#pragma omp parallel num_threads(num_threads)
-    parallel_sort_mwms_pu(&(pus[omp_get_thread_num()]), comp);
+            if (Settings::sort_splitting == Settings::SAMPLING)
+              {
+                unsigned int size = 
+                    (Settings::sort_mwms_oversampling * num_threads - 1) * num_threads;
+                sd.samples = static_cast<value_type*>(
+                    ::operator new(size * sizeof(value_type)));
+              }
+            else
+              sd.samples = NULL;
+
+            sd.offsets = new difference_type[num_threads - 1];
+            sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
+            for (int s = 0; s < num_threads; s++)
+              sd.pieces[s].resize(num_threads);
+            starts = sd.starts = new difference_type[num_threads + 1];
+            sd.stable = stable;
+
+            difference_type chunk_length = n / num_threads;
+            difference_type split = n % num_threads;
+            difference_type pos = 0;
+            for (int i = 0; i < num_threads; i++)
+              {
+                starts[i] = pos;
+                pos += (i < split) ? (chunk_length + 1) : chunk_length;
+              }
+            starts[num_threads] = pos;
+          }
+
+        // Now sort in parallel.
+        parallel_sort_mwms_pu(&sd, comp);
+      } //parallel
 
-    // XXX sd as RAII
     delete[] starts;
     delete[] sd.temporaries;
     delete[] sd.sorting_places;
     delete[] sd.merging_places;
 
     if (Settings::sort_splitting == Settings::SAMPLING)
-      delete[] sd.samples;
+        delete[] sd.samples;
 
     delete[] sd.offsets;
     delete[] sd.pieces;
-
-    delete[] pus;
   }
-
-}
+} //namespace __gnu_parallel
 
 #endif
index 23fe6f4..0a992b0 100644 (file)
 
 #include <parallel/settings.h>
 #include <parallel/basic_iterator.h>
+#include <parallel/base.h>
 
 namespace __gnu_parallel
 {
-  /** @brief Embarrassingly parallel algorithm for random access
-   * iterators, using an OpenMP for loop.
-   *
-   *  @param begin Begin iterator of element sequence.
-   *  @param end End iterator of element sequence.
-   *  @param o User-supplied functor (comparator, predicate, adding
-   *  functor, etc.).
-   *  @param f Functor to "process" an element with op (depends on
-   *  desired functionality, e. g. for std::for_each(), ...).
-   *  @param r Functor to "add" a single result to the already
-   *  processed elements (depends on functionality).
-   *  @param base Base value for reduction.
-   *  @param output Pointer to position where final result is written to
-   *  @param bound Maximum number of elements processed (e. g. for
-   *  std::count_n()).
-   *  @return User-supplied functor (that may contain a part of the result).
-   */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+/** @brief Embarrassingly parallel algorithm for random access
+  * iterators, using an OpenMP for loop.
+  *
+  *  @param begin Begin iterator of element sequence.
+  *  @param end End iterator of element sequence.
+  *  @param o User-supplied functor (comparator, predicate, adding
+  *  functor, etc.).
+  *  @param f Functor to "process" an element with op (depends on
+  *  desired functionality, e. g. for std::for_each(), ...).
+  *  @param r Functor to "add" a single result to the already
+  *  processed elements (depends on functionality).
+  *  @param base Base value for reduction.
+  *  @param output Pointer to position where final result is written to
+  *  @param bound Maximum number of elements processed (e. g. for
+  *  std::count_n()).
+  *  @return User-supplied functor (that may contain a part of the result).
+  */
+template<typename RandomAccessIterator,
+            typename Op,
+            typename Fu,
+            typename Red,
+            typename Result>
   Op
-  for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+  for_each_template_random_access_omp_loop(
+             RandomAccessIterator begin,
+             RandomAccessIterator end,
+             Op o, Fu& f, Red r, Result base, Result& output,
+             typename std::iterator_traits<RandomAccessIterator>::
+                 difference_type bound)
   {
-    typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
+    typedef typename
+        std::iterator_traits<RandomAccessIterator>::difference_type
+        difference_type;
 
-    thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
-    Result *thread_results = new Result[num_threads];
     difference_type length = end - begin;
+    thread_index_t num_threads =
+        __gnu_parallel::min<difference_type>(get_max_threads(), length);
 
-    for (thread_index_t i = 0; i < num_threads; i++)
+    Result *thread_results;
+
+#   pragma omp parallel num_threads(num_threads)
       {
-       thread_results[i] = r(thread_results[i], f(o, begin+i));
-      }
-
-#pragma omp parallel num_threads(num_threads)
-    {
-#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
-      for (difference_type pos = 0; pos < length; pos++)
-       {
-         thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
-       }
-    }
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+            thread_results = new Result[num_threads];
+
+            for (thread_index_t i = 0; i < num_threads; i++)
+              thread_results[i] = Result();
+          }
+
+        thread_index_t iam = omp_get_thread_num();
+
+#       pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
+        for (difference_type pos = 0; pos < length; pos++)
+          thread_results[iam] =
+              r(thread_results[iam], f(o, begin+pos));
+      } //parallel
 
     for (thread_index_t i = 0; i < num_threads; i++)
-      {
-       output = r(output, thread_results[i]);
-      }
+        output = r(output, thread_results[i]);
 
     delete [] thread_results;
 
@@ -100,6 +117,7 @@ namespace __gnu_parallel
 
     return o;
   }
+
 } // end namespace
 
 #endif
index 22acb2d..df2c3b5 100644 (file)
@@ -64,39 +64,50 @@ namespace __gnu_parallel
    *  std::count_n()).
    *  @return User-supplied functor (that may contain a part of the result).
    */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+template<typename RandomAccessIterator,
+          typename Op,
+          typename Fu,
+          typename Red,
+          typename Result>
   Op
-  for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
-                                                 RandomAccessIterator end,
-                                                 Op o, Fu& f, Red r,
-                                                 Result base, Result& output,
-                                                 typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+  for_each_template_random_access_omp_loop_static(
+              RandomAccessIterator begin,
+              RandomAccessIterator end,
+              Op o, Fu& f, Red r, Result base, Result& output,
+              typename std::iterator_traits<RandomAccessIterator>::
+                  difference_type bound)
   {
-    typedef std::iterator_traits<RandomAccessIterator> traits_type;
-    typedef typename traits_type::difference_type difference_type;
+    typedef typename
+        std::iterator_traits<RandomAccessIterator>::difference_type
+        difference_type;
 
-    thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
-    Result *thread_results = new Result[num_threads];
     difference_type length = end - begin;
+    thread_index_t num_threads =
+        std::min<difference_type>(get_max_threads(), length);
 
-    for (thread_index_t i = 0; i < num_threads; i++)
+    Result *thread_results;
+
+#   pragma omp parallel num_threads(num_threads)
       {
-       thread_results[i] = r(thread_results[i], f(o, begin+i));
-      }
-
-#pragma omp parallel num_threads(num_threads)
-    {
-#pragma omp for schedule(static, Settings::workstealing_chunk_size)
-      for (difference_type pos = 0; pos < length; pos++)
-       {
-         thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
-       }
-    }
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+            thread_results = new Result[num_threads];
+
+            for (thread_index_t i = 0; i < num_threads; i++)
+              thread_results[i] = Result();
+          }
+
+        thread_index_t iam = omp_get_thread_num();
+
+#       pragma omp for schedule(static, Settings::workstealing_chunk_size)
+        for (difference_type pos = 0; pos < length; pos++)
+          thread_results[iam] =
+              r(thread_results[iam], f(o, begin+pos));
+      } //parallel
 
     for (thread_index_t i = 0; i < num_threads; i++)
-      {
-       output = r(output, thread_results[i]);
-      }
+        output = r(output, thread_results[i]);
 
     delete [] thread_results;
 
@@ -106,6 +117,7 @@ namespace __gnu_parallel
 
     return o;
   }
+
 } // end namespace
 
 #endif
index 98604cf..6954e74 100644 (file)
 
 #include <omp.h>
 #include <parallel/settings.h>
+#include <parallel/base.h>
 
 namespace __gnu_parallel
 {
 
-  /** @brief Embarrassingly parallel algorithm for random access
-   * iterators, using hand-crafted parallelization by equal splitting
-   * the work.
-   *
-   *  @param begin Begin iterator of element sequence.
-   *  @param end End iterator of element sequence.
-   *  @param o User-supplied functor (comparator, predicate, adding
-   *  functor, ...)
-   *  @param f Functor to "process" an element with op (depends on
-   *  desired functionality, e. g. for std::for_each(), ...).
-   *  @param r Functor to "add" a single result to the already
-   *  processed elements (depends on functionality).
-   *  @param base Base value for reduction.
-   *  @param output Pointer to position where final result is written to
-   *  @param bound Maximum number of elements processed (e. g. for
-   *  std::count_n()).
-   *  @return User-supplied functor (that may contain a part of the result).
-   */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+/** @brief Embarrassingly parallel algorithm for random access
+  * iterators, using hand-crafted parallelization by equal splitting
+  * the work.
+  *
+  *  @param begin Begin iterator of element sequence.
+  *  @param end End iterator of element sequence.
+  *  @param o User-supplied functor (comparator, predicate, adding
+  *  functor, ...)
+  *  @param f Functor to "process" an element with op (depends on
+  *  desired functionality, e. g. for std::for_each(), ...).
+  *  @param r Functor to "add" a single result to the already
+  *  processed elements (depends on functionality).
+  *  @param base Base value for reduction.
+  *  @param output Pointer to position where final result is written to
+  *  @param bound Maximum number of elements processed (e. g. for
+  *  std::count_n()).
+  *  @return User-supplied functor (that may contain a part of the result).
+  */
+template<
+    typename RandomAccessIterator,
+    typename Op,
+    typename Fu,
+    typename Red,
+    typename Result>
   Op
-  for_each_template_random_access_ed(RandomAccessIterator begin,
-                                    RandomAccessIterator end, Op o, Fu& f,
-                                    Red r, Result base, Result& output,
-                                    typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+  for_each_template_random_access_ed(
+              RandomAccessIterator begin,
+              RandomAccessIterator end,
+              Op o, Fu& f, Red r, Result base, Result& output,
+              typename std::iterator_traits<RandomAccessIterator>::
+                  difference_type bound)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::difference_type difference_type;
 
     const difference_type length = end - begin;
-    const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
-    const difference_type dmin = settings_threads < length ? settings_threads : length;
-    const difference_type dmax = dmin > 1 ? dmin : 1;
+    Result *thread_results;
 
-    thread_index_t num_threads = static_cast<thread_index_t>(dmax);
+    thread_index_t num_threads =
+        __gnu_parallel::min<difference_type>(get_max_threads(), length);
 
+#   pragma omp parallel num_threads(num_threads)
+      {
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+            thread_results = new Result[num_threads];
+          }
 
-    Result *thread_results = new Result[num_threads];
+        thread_index_t iam = omp_get_thread_num();
 
-#pragma omp parallel num_threads(num_threads)
-    {
-      // Neutral element.
-      Result reduct = Result();
+        // Neutral element.
+        Result reduct = Result();
 
-      thread_index_t p = num_threads;
-      thread_index_t iam = omp_get_thread_num();
-      difference_type start = iam * length / p;
-      difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
+        difference_type
+            start = equally_split_point(length, num_threads, iam),
+            stop = equally_split_point(length, num_threads, iam + 1);
 
-      if (start < limit)
-       {
-         reduct = f(o, begin + start);
-         start++;
-       }
+        if (start < stop)
+          {
+            reduct = f(o, begin + start);
+            ++start;
+          }
 
-      for (; start < limit; start++)
-       reduct = r(reduct, f(o, begin + start));
+        for (; start < stop; ++start)
+          reduct = r(reduct, f(o, begin + start));
 
-      thread_results[iam] = reduct;
-    }
+        thread_results[iam] = reduct;
+      } //parallel
 
     for (thread_index_t i = 0; i < num_threads; i++)
       output = r(output, thread_results[i]);
index e7652c0..3dfce86 100644 (file)
@@ -48,130 +48,156 @@ namespace __gnu_parallel
 {
   // Problem: there is no 0-element given.
 
-  /** @brief Base case prefix sum routine.
-   *  @param begin Begin iterator of input sequence.
-   *  @param end End iterator of input sequence.
-   *  @param result Begin iterator of output sequence.
-   *  @param bin_op Associative binary function.
-   *  @param value Start value. Must be passed since the neutral
-   *  element is unknown in general.
-   *  @return End iterator of output sequence. */
-  template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
+/** @brief Base case prefix sum routine.
+  *  @param begin Begin iterator of input sequence.
+  *  @param end End iterator of input sequence.
+  *  @param result Begin iterator of output sequence.
+  *  @param bin_op Associative binary function.
+  *  @param value Start value. Must be passed since the neutral
+  *  element is unknown in general.
+  *  @return End iterator of output sequence. */
+template<
+    typename InputIterator,
+    typename OutputIterator,
+    typename BinaryOperation>
   inline OutputIterator
-  parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
-                               OutputIterator result, BinaryOperation bin_op,
-                               typename std::iterator_traits<InputIterator>::value_type value)
+  parallel_partial_sum_basecase(
+            InputIterator begin, InputIterator end,
+            OutputIterator result, BinaryOperation bin_op,
+            typename std::iterator_traits<InputIterator>::value_type value)
   {
     if (begin == end)
       return result;
 
     while (begin != end)
       {
-       value = bin_op(value, *begin);
-       *result = value;
-       result++;
-       begin++;
+        value = bin_op(value, *begin);
+        *result = value;
+        result++;
+        begin++;
       }
     return result;
   }
 
-  /** @brief Parallel partial sum implementation, two-phase approach,
-      no recursion.
-      *  @param begin Begin iterator of input sequence.
-      *  @param end End iterator of input sequence.
-      *  @param result Begin iterator of output sequence.
-      *  @param bin_op Associative binary function.
-      *  @param n Length of sequence.
-      *  @param num_threads Number of threads to use.
-      *  @return End iterator of output sequence.
-      */
-  template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
+/** @brief Parallel partial sum implementation, two-phase approach,
+    no recursion.
+    *  @param begin Begin iterator of input sequence.
+    *  @param end End iterator of input sequence.
+    *  @param result Begin iterator of output sequence.
+    *  @param bin_op Associative binary function.
+    *  @param n Length of sequence.
+    *  @param num_threads Number of threads to use.
+    *  @return End iterator of output sequence.
+    */
+template<
+    typename InputIterator,
+    typename OutputIterator,
+    typename BinaryOperation>
   OutputIterator
-  parallel_partial_sum_linear(InputIterator begin, InputIterator end,
-                             OutputIterator result, BinaryOperation bin_op,
-                             typename std::iterator_traits<InputIterator>::difference_type n, int num_threads)
+  parallel_partial_sum_linear(
+            InputIterator begin, InputIterator end,
+            OutputIterator result, BinaryOperation bin_op,
+            typename std::iterator_traits<InputIterator>::difference_type n)
   {
     typedef std::iterator_traits<InputIterator> traits_type;
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
-    if (num_threads > (n - 1))
-      num_threads = static_cast<thread_index_t>(n - 1);
+    thread_index_t num_threads =
+        std::min<difference_type>(get_max_threads(), n - 1);
+
     if (num_threads < 2)
       {
-       *result = *begin;
-       return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin);
+        *result = *begin;
+        return parallel_partial_sum_basecase(
+            begin + 1, end, result + 1, bin_op, *begin);
       }
 
-    difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2)));
+    difference_type* borders;
+    value_type* sums;
 
-    if (Settings::partial_sum_dilatation == 1.0f)
-      equally_split(n, num_threads + 1, borders);
-    else
+#   pragma omp parallel num_threads(num_threads)
       {
-       difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length;
-       borders[0] = 0;
-       for (int i = 1; i < (num_threads + 1); i++)
-         {
-           borders[i] = borderstart;
-           borderstart += chunk_length;
-         }
-       borders[num_threads + 1] = n;
-      }
-
-    value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads));
-    OutputIterator target_end;
-
-#pragma omp parallel num_threads(num_threads)
-    {
-      int id = omp_get_thread_num();
-      if (id == 0)
-       {
-         *result = *begin;
-         parallel_partial_sum_basecase(begin + 1, begin + borders[1], 
-                                       result + 1, bin_op, *begin);
-         sums[0] = *(result + borders[1] - 1);
-       }
-      else
-       {
-         sums[id] = std::accumulate(begin + borders[id] + 1, 
-                                    begin + borders[id + 1], 
-                                    *(begin + borders[id]), 
-                                    bin_op, __gnu_parallel::sequential_tag());
-       }
-
-#pragma omp barrier
-
-#pragma omp single
-      parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1, 
-                                   bin_op, sums[0]);
-
-#pragma omp barrier
-
-      // Still same team.
-      parallel_partial_sum_basecase(begin + borders[id + 1], 
-                                   begin + borders[id + 2], 
-                                   result + borders[id + 1], bin_op, 
-                                   sums[id]);
-    }
-
-    delete [] sums;
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+
+            borders = new difference_type[num_threads + 2];
+
+            if (Settings::partial_sum_dilatation == 1.0f)
+              equally_split(n, num_threads + 1, borders);
+            else
+              {
+                difference_type chunk_length =
+                    ((double)n /
+                    ((double)num_threads + Settings::partial_sum_dilatation)),
+                    borderstart = n - num_threads * chunk_length;
+                borders[0] = 0;
+                for (int i = 1; i < (num_threads + 1); i++)
+                  {
+                    borders[i] = borderstart;
+                    borderstart += chunk_length;
+                  }
+                borders[num_threads + 1] = n;
+              }
+
+            sums = static_cast<value_type*>(
+                ::operator new(sizeof(value_type) * num_threads));
+            OutputIterator target_end;
+          } //single
+
+        int iam = omp_get_thread_num();
+        if (iam == 0)
+          {
+            *result = *begin;
+            parallel_partial_sum_basecase(begin + 1, begin + borders[1],
+                          result + 1, bin_op, *begin);
+            sums[0] = *(result + borders[1] - 1);
+          }
+        else
+          {
+            sums[iam] = std::accumulate(begin + borders[iam] + 1,
+                            begin + borders[iam + 1],
+                            *(begin + borders[iam]),
+                            bin_op, __gnu_parallel::sequential_tag());
+          }
+
+#       pragma omp barrier
+
+#       pragma omp single
+          parallel_partial_sum_basecase(
+              sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
+
+#       pragma omp barrier
+
+        // Still same team.
+        parallel_partial_sum_basecase(begin + borders[iam + 1],
+                      begin + borders[iam + 2],
+                      result + borders[iam + 1], bin_op,
+                      sums[iam]);
+      } //parallel
+
+    delete[] sums;
+    delete[] borders;
 
     return result + n;
   }
 
-  /** @brief Parallel partial sum front-end.
-   *  @param begin Begin iterator of input sequence.
-   *  @param end End iterator of input sequence.
-   *  @param result Begin iterator of output sequence.
-   *  @param bin_op Associative binary function.
-   *  @return End iterator of output sequence. */
-  template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
+/** @brief Parallel partial sum front-end.
+  *  @param begin Begin iterator of input sequence.
+  *  @param end End iterator of input sequence.
+  *  @param result Begin iterator of output sequence.
+  *  @param bin_op Associative binary function.
+  *  @return End iterator of output sequence. */
+template<
+    typename InputIterator,
+    typename OutputIterator,
+    typename BinaryOperation>
   OutputIterator
   parallel_partial_sum(InputIterator begin, InputIterator end,
-                      OutputIterator result, BinaryOperation bin_op)
+                       OutputIterator result, BinaryOperation bin_op)
   {
-    _GLIBCXX_CALL(begin - end);
+    _GLIBCXX_CALL(begin - end)
 
     typedef std::iterator_traits<InputIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -179,18 +205,15 @@ namespace __gnu_parallel
 
     difference_type n = end - begin;
 
-    int num_threads = get_max_threads();
-
     switch (Settings::partial_sum_algorithm)
       {
       case Settings::LINEAR:
-       // Need an initial offset.
-       return parallel_partial_sum_linear(begin, end, result, bin_op,
-                                          n, num_threads);
+        // Need an initial offset.
+        return parallel_partial_sum_linear(begin, end, result, bin_op, n);
       default:
-       // Partial_sum algorithm not implemented.
-       _GLIBCXX_PARALLEL_ASSERT(0);
-       return result + n;
+    // Partial_sum algorithm not implemented.
+        _GLIBCXX_PARALLEL_ASSERT(0);
+        return result + n;
       }
   }
 }
index 2b8631d..d6dac37 100644 (file)
 #include <bits/stl_algo.h>
 #include <parallel/parallel.h>
 
-/** @brief Decide whether to declare certain variable volatile in this file. */
+/** @brief Decide whether to declare certain variables volatile. */
 #define _GLIBCXX_VOLATILE volatile
 
 namespace __gnu_parallel
 {
-  /** @brief Parallel implementation of std::partition.
-   *  @param begin Begin iterator of input sequence to split.
-   *  @param end End iterator of input sequence to split.
-   *  @param pred Partition predicate, possibly including some kind of pivot.
-   *  @param max_num_threads Maximum number of threads to use for this task.
-   *  @return Number of elements not fulfilling the predicate. */
-  template<typename RandomAccessIterator, typename Predicate>
-  inline typename std::iterator_traits<RandomAccessIterator>::difference_type
+/** @brief Parallel implementation of std::partition.
+  *  @param begin Begin iterator of input sequence to split.
+  *  @param end End iterator of input sequence to split.
+  *  @param pred Partition predicate, possibly including some kind of pivot.
+  *  @param num_threads Maximum number of threads to use for this task.
+  *  @return Number of elements not fulfilling the predicate. */
+template<typename RandomAccessIterator, typename Predicate>
+  typename std::iterator_traits<RandomAccessIterator>::difference_type
   parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
-                    Predicate pred, thread_index_t max_num_threads)
+                     Predicate pred, thread_index_t num_threads)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -74,212 +74,238 @@ namespace __gnu_parallel
     _GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
     _GLIBCXX_VOLATILE difference_type leftnew, rightnew;
 
-    bool* reserved_left, * reserved_right;
-
-    reserved_left = new bool[max_num_threads];
-    reserved_right = new bool[max_num_threads];
+    bool* reserved_left = NULL, * reserved_right = NULL;
 
     difference_type chunk_size;
-    if (Settings::partition_chunk_share > 0.0)
-      chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
-    else
-      chunk_size = Settings::partition_chunk_size;
 
     omp_lock_t result_lock;
     omp_init_lock(&result_lock);
 
-    // At least good for two processors.
-    while (right - left + 1 >= 2 * max_num_threads * chunk_size)
+    //at least two chunks per thread
+    if(right - left + 1 >= 2 * num_threads * chunk_size)
+#   pragma omp parallel num_threads(num_threads)
       {
-       difference_type num_chunks = (right - left + 1) / chunk_size;
-       thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2);
-
-       for (int r = 0; r < num_threads; r++)
-         {
-           reserved_left[r] = false;
-           reserved_right[r] = false;
-         }
-       leftover_left = 0;
-       leftover_right = 0;
-
-#pragma omp parallel num_threads(num_threads)
-       {
-         // Private.
-         difference_type thread_left, thread_left_border, thread_right, thread_right_border;
-         thread_left = left + 1;
-
-         // Just to satisfy the condition below.
-         thread_left_border = thread_left - 1;
-         thread_right = n - 1;
-         thread_right_border = thread_right + 1;
-
-         bool iam_finished = false;
-         while (!iam_finished)
-           {
-             if (thread_left > thread_left_border)
-               {
-                  omp_set_lock(&result_lock);
-                 if (left + (chunk_size - 1) > right)
-                   iam_finished = true;
-                 else
-                   {
-                     thread_left = left;
-                     thread_left_border = left + (chunk_size - 1);
-                     left += chunk_size;
-                   }
-                  omp_unset_lock(&result_lock);
-               }
-
-             if (thread_right < thread_right_border)
-               {
-                  omp_set_lock(&result_lock);
-                 if (left > right - (chunk_size - 1))
-                   iam_finished = true;
-                 else
-                   {
-                     thread_right = right;
-                     thread_right_border = right - (chunk_size - 1);
-                     right -= chunk_size;
-                   }
-                  omp_unset_lock(&result_lock);
-               }
-
-             if (iam_finished)
-               break;
-
-             // Swap as usual.
-             while (thread_left < thread_right)
-               {
-                 while (pred(begin[thread_left]) && thread_left <= thread_left_border)
-                   thread_left++;
-                 while (!pred(begin[thread_right]) && thread_right >= thread_right_border)
-                   thread_right--;
-
-                 if (thread_left > thread_left_border || thread_right < thread_right_border)
-                   // Fetch new chunk(s).
-                   break;
-
-                 std::swap(begin[thread_left], begin[thread_right]);
-                 thread_left++;
-                 thread_right--;
-               }
-           }
-
-         // Now swap the leftover chunks to the right places.
-         if (thread_left <= thread_left_border)
-#pragma omp atomic
-           leftover_left++;
-         if (thread_right >= thread_right_border)
-#pragma omp atomic
-           leftover_right++;
-
-#pragma omp barrier
-
-#pragma omp single
-         {
-           leftnew = left - leftover_left * chunk_size;
-           rightnew = right + leftover_right * chunk_size;
-         }
-
-#pragma omp barrier
-
-         // <=> thread_left_border + (chunk_size - 1) >= leftnew
-         if (thread_left <= thread_left_border
-             && thread_left_border >= leftnew)
-           {
-             // Chunk already in place, reserve spot.
-             reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true;
-           }
-
-         // <=> thread_right_border - (chunk_size - 1) <= rightnew
-         if (thread_right >= thread_right_border
-             && thread_right_border <= rightnew)
-           {
-             // Chunk already in place, reserve spot.
-             reserved_right[((thread_right_border - 1) - right) / chunk_size] = true;
-           }
-
-#pragma omp barrier
-
-         if (thread_left <= thread_left_border && thread_left_border < leftnew)
-           {
-             // Find spot and swap.
-             difference_type swapstart = -1;
-              omp_set_lock(&result_lock);
-             for (int r = 0; r < leftover_left; r++)
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+            reserved_left = new bool[num_threads];
+            reserved_right = new bool[num_threads];
+
+            if (Settings::partition_chunk_share > 0.0)
+              chunk_size = std::max<difference_type>(
+                  Settings::partition_chunk_size,
+                  (double)n * Settings::partition_chunk_share /
+                        (double)num_threads);
+            else
+              chunk_size = Settings::partition_chunk_size;
+          }
+
+        while (right - left + 1 >= 2 * num_threads * chunk_size)
+          {
+#           pragma omp single
+              {
+                difference_type num_chunks = (right - left + 1) / chunk_size;
+
+                for (int r = 0; r < num_threads; r++)
+                  {
+                    reserved_left[r] = false;
+                    reserved_right[r] = false;
+                  }
+                leftover_left = 0;
+                leftover_right = 0;
+              } //implicit barrier
+
+            // Private.
+            difference_type thread_left, thread_left_border,
+                            thread_right, thread_right_border;
+            thread_left = left + 1;
+
+            // Just to satisfy the condition below.
+            thread_left_border = thread_left - 1;
+            thread_right = n - 1;
+            thread_right_border = thread_right + 1;
+
+            bool iam_finished = false;
+            while (!iam_finished)
+              {
+                if (thread_left > thread_left_border)
+                  {
+                    omp_set_lock(&result_lock);
+                    if (left + (chunk_size - 1) > right)
+                      iam_finished = true;
+                    else
+                      {
+                        thread_left = left;
+                        thread_left_border = left + (chunk_size - 1);
+                        left += chunk_size;
+                      }
+                    omp_unset_lock(&result_lock);
+                  }
+
+                if (thread_right < thread_right_border)
+                  {
+                    omp_set_lock(&result_lock);
+                    if (left > right - (chunk_size - 1))
+                      iam_finished = true;
+                    else
+                      {
+                        thread_right = right;
+                        thread_right_border = right - (chunk_size - 1);
+                        right -= chunk_size;
+                      }
+                    omp_unset_lock(&result_lock);
+                  }
+
+                if (iam_finished)
+                  break;
+
+                // Swap as usual.
+                while (thread_left < thread_right)
+                  {
+                    while (pred(begin[thread_left])
+                            && thread_left <= thread_left_border)
+                      thread_left++;
+                    while (!pred(begin[thread_right])
+                            && thread_right >= thread_right_border)
+                      thread_right--;
+
+                    if (thread_left > thread_left_border
+                        || thread_right < thread_right_border)
+                      // Fetch new chunk(s).
+                      break;
+
+                    std::swap(begin[thread_left], begin[thread_right]);
+                    thread_left++;
+                    thread_right--;
+                  }
+              }
+
+            // Now swap the leftover chunks to the right places.
+            if (thread_left <= thread_left_border)
+#             pragma omp atomic
+              leftover_left++;
+            if (thread_right >= thread_right_border)
+#             pragma omp atomic
+              leftover_right++;
+
+#           pragma omp barrier
+
+#           pragma omp single
+              {
+                leftnew = left - leftover_left * chunk_size;
+                rightnew = right + leftover_right * chunk_size;
+              }
+
+#           pragma omp barrier
+
+            // <=> thread_left_border + (chunk_size - 1) >= leftnew
+            if (thread_left <= thread_left_border
+                && thread_left_border >= leftnew)
+              {
+                // Chunk already in place, reserve spot.
+                reserved_left[(left - (thread_left_border + 1)) / chunk_size]
+                    = true;
+              }
+
+            // <=> thread_right_border - (chunk_size - 1) <= rightnew
+            if (thread_right >= thread_right_border
+                && thread_right_border <= rightnew)
+              {
+                // Chunk already in place, reserve spot.
+                reserved_right
+                    [((thread_right_border - 1) - right) / chunk_size]
+                    = true;
+              }
+
+#           pragma omp barrier
+
+            if (thread_left <= thread_left_border
+                && thread_left_border < leftnew)
+              {
+                // Find spot and swap.
+                difference_type swapstart = -1;
+                omp_set_lock(&result_lock);
+                for (int r = 0; r < leftover_left; r++)
                   if (!reserved_left[r])
                     {
                       reserved_left[r] = true;
                       swapstart = left - (r + 1) * chunk_size;
                       break;
                     }
-              omp_unset_lock(&result_lock);
+                omp_unset_lock(&result_lock);
 
 #if _GLIBCXX_ASSERTIONS
-             _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
+                _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
 #endif
 
-             std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart);
-           }
-
-         if (thread_right >= thread_right_border
-             && thread_right_border > rightnew)
-           {
-             // Find spot and swap
-             difference_type swapstart = -1;
-              omp_set_lock(&result_lock);
-             for (int r = 0; r < leftover_right; r++)
-                 if (!reserved_right[r])
-                   {
-                     reserved_right[r] = true;
-                     swapstart = right + r * chunk_size + 1;
-                     break;
-                   }
-              omp_unset_lock(&result_lock);
+                std::swap_ranges(
+                    begin + thread_left_border - (chunk_size - 1),
+                    begin + thread_left_border + 1,
+                    begin + swapstart);
+              }
+
+            if (thread_right >= thread_right_border
+                && thread_right_border > rightnew)
+              {
+                // Find spot and swap
+                difference_type swapstart = -1;
+                omp_set_lock(&result_lock);
+                for (int r = 0; r < leftover_right; r++)
+                  if (!reserved_right[r])
+                    {
+                      reserved_right[r] = true;
+                      swapstart = right + r * chunk_size + 1;
+                      break;
+                    }
+                omp_unset_lock(&result_lock);
 
 #if _GLIBCXX_ASSERTIONS
-             _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
+                _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
 #endif
 
-             std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart);
-           }
+                std::swap_ranges(begin + thread_right_border,
+                                begin + thread_right_border + chunk_size,
+                                begin + swapstart);
+              }
 #if _GLIBCXX_ASSERTIONS
-#pragma omp barrier
+#             pragma omp barrier
 
-#pragma omp single
-         {
-           for (int r = 0; r < leftover_left; r++)
-             _GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
-           for (int r = 0; r < leftover_right; r++)
-             _GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
-         }
+#             pragma omp single
+                {
+                  for (int r = 0; r < leftover_left; r++)
+                    _GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
+                  for (int r = 0; r < leftover_right; r++)
+                    _GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
+                }
 
-#pragma omp barrier
+#             pragma omp barrier
 #endif
 
-#pragma omp barrier
-         left = leftnew;
-         right = rightnew;
-       }
-      }        // end "recursion"
+#             pragma omp barrier
+
+              left = leftnew;
+              right = rightnew;
+          }
+#         pragma omp flush(left, right)
+      } // end "recursion" //parallel
 
     difference_type final_left = left, final_right = right;
 
     while (final_left < final_right)
       {
-       // Go right until key is geq than pivot.
-       while (pred(begin[final_left]) && final_left < final_right)
-         final_left++;
-
-       // Go left until key is less than pivot.
-       while (!pred(begin[final_right]) && final_left < final_right)
-         final_right--;
-
-       if (final_left == final_right)
-         break;
-       std::swap(begin[final_left], begin[final_right]);
-       final_left++;
-       final_right--;
+        // Go right until key is geq than pivot.
+        while (pred(begin[final_left]) && final_left < final_right)
+          final_left++;
+
+        // Go left until key is less than pivot.
+        while (!pred(begin[final_right]) && final_left < final_right)
+          final_right--;
+
+        if (final_left == final_right)
+          break;
+        std::swap(begin[final_left], begin[final_right]);
+        final_left++;
+        final_right--;
       }
 
     // All elements on the left side are < piv, all elements on the
@@ -298,14 +324,14 @@ namespace __gnu_parallel
       return final_left + 1;
   }
 
-  /** 
-   *  @brief Parallel implementation of std::nth_element().
-   *  @param begin Begin iterator of input sequence.
-   *  @param nth Iterator of element that must be in position afterwards.
-   *  @param end End iterator of input sequence.
-   *  @param comp Comparator. 
-   */
-  template<typename RandomAccessIterator, typename Comparator>
+/**
+  *  @brief Parallel implementation of std::nth_element().
+  *  @param begin Begin iterator of input sequence.
+  *  @param nth Iterator of element that must be in position afterwards.
+  *  @param end End iterator of input sequence.
+  *  @param comp Comparator.
+  */
+template<typename RandomAccessIterator, typename Comparator>
   void 
   parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth, 
                       RandomAccessIterator end, Comparator comp)
@@ -324,65 +350,65 @@ namespace __gnu_parallel
     // Break if input range to small.
     while (static_cast<sequence_index_t>(end - begin) >= minimum_length)
       {
-       difference_type n = end - begin;
-
-       RandomAccessIterator pivot_pos = begin +  rng(n);
-
-       // Swap pivot_pos value to end.
-       if (pivot_pos != (end - 1))
-         std::swap(*pivot_pos, *(end - 1));
-       pivot_pos = end - 1;
-
-       // XXX Comparator must have first_value_type, second_value_type, result_type
-       // Comparator == __gnu_parallel::lexicographic<S, int, __gnu_parallel::less<S, S> > 
-       // pivot_pos == std::pair<S, int>*
-       // XXX binder2nd only for RandomAccessIterators??
-       __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
-
-       // Divide, leave pivot unchanged in last place.
-       RandomAccessIterator split_pos1, split_pos2;
-       split_pos1 = begin + parallel_partition(begin, end - 1, pred, get_max_threads());
-
-       // Left side: < pivot_pos; right side: >= pivot_pos
-
-       // Swap pivot back to middle.
-       if (split_pos1 != pivot_pos)
-         std::swap(*split_pos1, *pivot_pos);
-       pivot_pos = split_pos1;
-
-       // In case all elements are equal, split_pos1 == 0
-       if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7))
-         {
-           // Very unequal split, one part smaller than one 128th
-           // elements not stricly larger than the pivot.
-           __gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
-
-           // Find other end of pivot-equal range.
-           split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
-         }
-       else
-         // Only skip the pivot.
-         split_pos2 = split_pos1 + 1;
-
-       // Compare iterators.
-       if (split_pos2 <= nth)
-         begin = split_pos2;
-       else if (nth < split_pos1)
-         end = split_pos1;
-       else
-         break;
+        difference_type n = end - begin;
+
+        RandomAccessIterator pivot_pos = begin +  rng(n);
+
+        // Swap pivot_pos value to end.
+        if (pivot_pos != (end - 1))
+          std::swap(*pivot_pos, *(end - 1));
+        pivot_pos = end - 1;
+
+        // XXX Comparator must have first_value_type, second_value_type, result_type
+        // Comparator == __gnu_parallel::lexicographic<S, int, __gnu_parallel::less<S, S> >
+        // pivot_pos == std::pair<S, int>*
+        // XXX binder2nd only for RandomAccessIterators??
+        __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
+
+        // Divide, leave pivot unchanged in last place.
+        RandomAccessIterator split_pos1, split_pos2;
+        split_pos1 = begin + parallel_partition(begin, end - 1, pred, get_max_threads());
+
+        // Left side: < pivot_pos; right side: >= pivot_pos
+
+        // Swap pivot back to middle.
+        if (split_pos1 != pivot_pos)
+          std::swap(*split_pos1, *pivot_pos);
+        pivot_pos = split_pos1;
+
+        // In case all elements are equal, split_pos1 == 0
+        if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7))
+          {
+            // Very unequal split, one part smaller than one 128th
+            // elements not stricly larger than the pivot.
+            __gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
+
+            // Find other end of pivot-equal range.
+            split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
+          }
+        else
+          // Only skip the pivot.
+          split_pos2 = split_pos1 + 1;
+
+        // Compare iterators.
+        if (split_pos2 <= nth)
+          begin = split_pos2;
+        else if (nth < split_pos1)
+          end = split_pos1;
+        else
+          break;
       }
 
     // Only at most Settings::partition_minimal_n elements left.
     __gnu_sequential::sort(begin, end, comp);
   }
 
-  /** @brief Parallel implementation of std::partial_sort().
-   *  @param begin Begin iterator of input sequence.
-   *  @param middle Sort until this position.
-   *  @param end End iterator of input sequence.
-   *  @param comp Comparator. */
-  template<typename RandomAccessIterator, typename Comparator>
+/** @brief Parallel implementation of std::partial_sort().
+*  @param begin Begin iterator of input sequence.
+*  @param middle Sort until this position.
+*  @param end End iterator of input sequence.
+*  @param comp Comparator. */
+template<typename RandomAccessIterator, typename Comparator>
   void
   parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
   {
@@ -390,7 +416,7 @@ namespace __gnu_parallel
     std::sort(begin, middle, comp);
   }
 
-}      //namespace __gnu_parallel
+} //namespace __gnu_parallel
 
 #undef _GLIBCXX_VOLATILE
 
index a9ceab4..4eb3578 100644 (file)
@@ -53,11 +53,17 @@ namespace __gnu_parallel
    *  this part.
    */
   template<typename RandomAccessIterator, typename Comparator>
-  inline typename std::iterator_traits<RandomAccessIterator>::difference_type
-  parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
-                         Comparator comp,
-                         typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
-                         typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
+  inline
+  typename std::iterator_traits<RandomAccessIterator>::difference_type
+  parallel_sort_qs_divide(
+      RandomAccessIterator begin,
+      RandomAccessIterator end,
+      Comparator comp,
+      typename std::iterator_traits<RandomAccessIterator>::difference_type
+          pivot_rank,
+      typename std::iterator_traits<RandomAccessIterator>::difference_type
+          num_samples,
+      thread_index_t num_threads)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -65,20 +71,24 @@ namespace __gnu_parallel
 
     difference_type n = end - begin;
     num_samples = std::min(num_samples, n);
-    value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
+
+    // Allocate uninitialized, to avoid default constructor.
+    value_type* samples = static_cast<value_type*>(
+        operator new(num_samples * sizeof(value_type)));
 
     for (difference_type s = 0; s < num_samples; s++)
       {
-       const unsigned long long index = static_cast<unsigned long long>(s) 
-                                        * n / num_samples;
-       samples[s] = begin[index];
+        const unsigned long long index = static_cast<unsigned long long>(s)
+                        * n / num_samples;
+        new(samples + s) value_type(begin[index]);
       }
 
     __gnu_sequential::sort(samples, samples + num_samples, comp);
 
     value_type& pivot = samples[pivot_rank * num_samples / n];
 
-    __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
+    __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
+        pred(comp, pivot);
     difference_type split = parallel_partition(begin, end, pred, num_threads);
 
     return split;
@@ -93,7 +103,10 @@ namespace __gnu_parallel
    */
   template<typename RandomAccessIterator, typename Comparator>
   inline void
-  parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
+  parallel_sort_qs_conquer(RandomAccessIterator begin,
+                           RandomAccessIterator end,
+                           Comparator comp,
+                           thread_index_t num_threads)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -101,8 +114,8 @@ namespace __gnu_parallel
 
     if (num_threads <= 1)
       {
-       __gnu_sequential::sort(begin, end, comp);
-       return;
+        __gnu_sequential::sort(begin, end, comp);
+        return;
       }
 
     difference_type n = end - begin, pivot_rank;
@@ -110,24 +123,27 @@ namespace __gnu_parallel
     if (n <= 1)
       return;
 
-    thread_index_t num_processors_left;
+    thread_index_t num_threads_left;
 
     if ((num_threads % 2) == 1)
-      num_processors_left = num_threads / 2 + 1;
+      num_threads_left = num_threads / 2 + 1;
     else
-      num_processors_left = num_threads / 2;
+      num_threads_left = num_threads / 2;
 
-    pivot_rank = n * num_processors_left / num_threads;
+    pivot_rank = n * num_threads_left / num_threads;
 
-    difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
-Settings::sort_qs_num_samples_preset, num_threads);
+    difference_type split = parallel_sort_qs_divide(
+        begin, end, comp, pivot_rank,
+        Settings::sort_qs_num_samples_preset, num_threads);
 
 #pragma omp parallel sections
     {
 #pragma omp section
-      parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
+      parallel_sort_qs_conquer(begin, begin + split,
+                               comp, num_threads_left);
 #pragma omp section
-      parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
+      parallel_sort_qs_conquer(begin + split, end,
+                               comp, num_threads - num_threads_left);
     }
   }
 
@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
    */
   template<typename RandomAccessIterator, typename Comparator>
   inline void
-  parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
-                  Comparator comp,
-                  typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
+  parallel_sort_qs(
+      RandomAccessIterator begin,
+      RandomAccessIterator end,
+      Comparator comp,
+      typename std::iterator_traits<RandomAccessIterator>::difference_type n,
+      int num_threads)
   {
     _GLIBCXX_CALL(n)
 
@@ -165,12 +184,9 @@ Settings::sort_qs_num_samples_preset, num_threads);
     // Hard to avoid.
     omp_set_num_threads(num_threads);
 
-    bool old_nested = (omp_get_nested() != 0);
-    omp_set_nested(true);
     parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
-    omp_set_nested(old_nested);
   }
 
-}      //namespace __gnu_parallel
+} //namespace __gnu_parallel
 
 #endif
index 933ab3e..d7a82fb 100644 (file)
 
 namespace __gnu_parallel
 {
-  /** @brief Type to hold the index of a bin.
-   *
-   *  Since many variables of this type are allocated, it should be
-   *  chosen as small as possible.
-   */
-  typedef unsigned short bin_index;
-
-  /** @brief Data known to every thread participating in
-      __gnu_parallel::parallel_random_shuffle(). */
-  template<typename RandomAccessIterator>
+/** @brief Type to hold the index of a bin.
+  *
+  *  Since many variables of this type are allocated, it should be
+  *  chosen as small as possible.
+  */
+typedef unsigned short bin_index;
+
+/** @brief Data known to every thread participating in
+    __gnu_parallel::parallel_random_shuffle(). */
+template<typename RandomAccessIterator>
   struct DRandomShufflingGlobalData
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
@@ -90,18 +90,15 @@ namespace __gnu_parallel
     : source(_source) { }
   };
 
-  /** @brief Local data for a thread participating in
-      __gnu_parallel::parallel_random_shuffle().
-   */
-  template<typename RandomAccessIterator, typename RandomNumberGenerator>
+/** @brief Local data for a thread participating in
+    __gnu_parallel::parallel_random_shuffle().
+  */
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
   struct DRSSorterPU
   {
     /** @brief Number of threads participating in total. */
     int num_threads;
 
-    /** @brief Number of owning thread. */
-    int iam;
-
     /** @brief Begin index for bins taken care of by this thread. */
     bin_index bins_begin;
 
@@ -115,29 +112,29 @@ namespace __gnu_parallel
     DRandomShufflingGlobalData<RandomAccessIterator>* sd;
   };
 
-  /** @brief Generate a random number in @c [0,2^logp).
-   *  @param logp Logarithm (basis 2) of the upper range bound.
-   *  @param rng Random number generator to use.
-   */
-  template<typename RandomNumberGenerator>
+/** @brief Generate a random number in @c [0,2^logp).
+  *  @param logp Logarithm (basis 2) of the upper range bound.
+  *  @param rng Random number generator to use.
+  */
+template<typename RandomNumberGenerator>
   inline int
   random_number_pow2(int logp, RandomNumberGenerator& rng)
   { return rng.genrand_bits(logp); }
 
-  /** @brief Random shuffle code executed by each thread.
-   *  @param pus Array of thread-local data records. */
-  template<typename RandomAccessIterator, typename RandomNumberGenerator>
+/** @brief Random shuffle code executed by each thread.
+  *  @param pus Array of thread-local data records. */
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
   inline void 
-  parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, 
-                                RandomNumberGenerator>* pus)
+  parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
+                                 RandomNumberGenerator>* pus)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
-    DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
+    thread_index_t iam = omp_get_thread_num();
+    DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
     DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
-    thread_index_t iam = d->iam;
 
     // Indexing: dist[bin][processor]
     difference_type length = sd->starts[iam + 1] - sd->starts[iam];
@@ -156,35 +153,35 @@ namespace __gnu_parallel
     // First main loop.
     for (difference_type i = 0; i < length; i++)
       {
-       bin_index oracle = random_number_pow2(num_bits, rng);
-       oracles[i] = oracle;
+        bin_index oracle = random_number_pow2(num_bits, rng);
+        oracles[i] = oracle;
 
-       // To allow prefix (partial) sum.
-       dist[oracle + 1]++;
+        // To allow prefix (partial) sum.
+        dist[oracle + 1]++;
       }
 
     for (bin_index b = 0; b < sd->num_bins + 1; b++)
       sd->dist[b][iam + 1] = dist[b];
 
-#pragma omp barrier
+#   pragma omp barrier
 
-#pragma omp single
+#   pragma omp single
     {
       // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
       // total number of items in bin s
       for (bin_index s = 0; s < sd->num_bins; s++)
-       __gnu_sequential::partial_sum(sd->dist[s + 1],
-                                     sd->dist[s + 1] + d->num_threads + 1,
-                                     sd->dist[s + 1]);
+        __gnu_sequential::partial_sum(sd->dist[s + 1],
+                                      sd->dist[s + 1] + d->num_threads + 1,
+                                      sd->dist[s + 1]);
     }
 
-#pragma omp barrier
+#   pragma omp barrier
 
     sequence_index_t offset = 0, global_offset = 0;
     for (bin_index s = 0; s < d->bins_begin; s++)
       global_offset += sd->dist[s + 1][d->num_threads];
 
-#pragma omp barrier
+#   pragma omp barrier
 
     for (bin_index s = d->bins_begin; s < d->bins_end; s++)
       {
@@ -193,9 +190,10 @@ namespace __gnu_parallel
        offset = sd->dist[s + 1][d->num_threads];
       }
 
-    sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset));
+    sd->temporaries[iam] = static_cast<value_type*>(
+      ::operator new(sizeof(value_type) * offset));
 
-#pragma omp barrier
+#   pragma omp barrier
 
     // Draw local copies to avoid false sharing.
     for (bin_index b = 0; b < sd->num_bins + 1; b++)
@@ -211,11 +209,11 @@ namespace __gnu_parallel
     // Distribute according to oracles, second main loop.
     for (difference_type i = 0; i < length; i++)
       {
-       bin_index target_bin = oracles[i];
-       thread_index_t target_p = bin_proc[target_bin];
+        bin_index target_bin = oracles[i];
+        thread_index_t target_p = bin_proc[target_bin];
 
-       // Last column [d->num_threads] stays unchanged.
-       temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
+        // Last column [d->num_threads] stays unchanged.
+        temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
       }
 
     delete[] oracles;
@@ -223,23 +221,27 @@ namespace __gnu_parallel
     delete[] bin_proc;
     delete[] temporaries;
 
-#pragma omp barrier
+#   pragma omp barrier
 
     // Shuffle bins internally.
     for (bin_index b = d->bins_begin; b < d->bins_end; b++)
       {
-       value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
-         * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
-       sequential_random_shuffle(begin, end, rng);
-       std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
+        value_type* begin =
+                    sd->temporaries[iam] +
+                    ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
+                  * end =
+                    sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
+        sequential_random_shuffle(begin, end, rng);
+        std::copy(begin, end, sd->source + global_offset +
+            ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
       }
 
     delete[] sd->temporaries[iam];
   }
 
-  /** @brief Round up to the next greater power of 2.
-   *  @param x Integer to round up */
-  template<typename T>
+/** @brief Round up to the next greater power of 2.
+  *  @param x Integer to round up */
+template<typename T>
   T 
   round_up_to_pow2(T x)
   {
@@ -249,16 +251,21 @@ namespace __gnu_parallel
       return (T)1 << (log2(x - 1) + 1);
   }
 
-  /** @brief Main parallel random shuffle step.
-   *  @param begin Begin iterator of sequence.
-   *  @param end End iterator of sequence.
-   *  @param n Length of sequence.
-   *  @param num_threads Number of threads to use.
-   *  @param rng Random number generator to use.
-   */
-  template<typename RandomAccessIterator, typename RandomNumberGenerator>
+/** @brief Main parallel random shuffle step.
+  *  @param begin Begin iterator of sequence.
+  *  @param end End iterator of sequence.
+  *  @param n Length of sequence.
+  *  @param num_threads Number of threads to use.
+  *  @param rng Random number generator to use.
+  */
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
   inline void
-  parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
+  parallel_random_shuffle_drs(
+      RandomAccessIterator begin,
+      RandomAccessIterator end,
+      typename std::iterator_traits<RandomAccessIterator>::difference_type n,
+      thread_index_t num_threads,
+      RandomNumberGenerator& rng)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -275,87 +282,99 @@ namespace __gnu_parallel
     // Try the L1 cache first.
 
     // Must fit into L1.
-    num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
+    num_bins_cache = std::max<difference_type>(
+        1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
     num_bins_cache = round_up_to_pow2(num_bins_cache);
 
     // No more buckets than TLB entries, power of 2
     // Power of 2 and at least one element per bin, at most the TLB size.
-    num_bins = std::min(n, (difference_type)num_bins_cache);
+    num_bins = std::min<difference_type>(n, num_bins_cache);
 
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
     // 2 TLB entries needed per bin.
-    num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
+    num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
 #endif
     num_bins = round_up_to_pow2(num_bins);
 
     if (num_bins < num_bins_cache)
       {
 #endif
-       // Now try the L2 cache
-       // Must fit into L2
-       num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
-       num_bins_cache = round_up_to_pow2(num_bins_cache);
-
-       // No more buckets than TLB entries, power of 2.
-       num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
-       // Power of 2 and at least one element per bin, at most the TLB size.
+        // Now try the L2 cache
+        // Must fit into L2
+        num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
+            1, n / (Settings::L2_cache_size / sizeof(value_type))));
+        num_bins_cache = round_up_to_pow2(num_bins_cache);
+
+        // No more buckets than TLB entries, power of 2.
+        num_bins = static_cast<bin_index>(
+            std::min(n, static_cast<difference_type>(num_bins_cache)));
+        // Power of 2 and at least one element per bin, at most the TLB size.
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
-       // 2 TLB entries needed per bin.
-       num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
+        // 2 TLB entries needed per bin.
+        num_bins = std::min(
+            static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
 #endif
-       num_bins = round_up_to_pow2(num_bins);
+          num_bins = round_up_to_pow2(num_bins);
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
       }
 #endif
 
-    num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
+    num_threads = std::min<bin_index>(num_threads, num_bins);
 
     if (num_threads <= 1)
       return sequential_random_shuffle(begin, end, rng);
 
     DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
+    DRSSorterPU<RandomAccessIterator, random_number >* pus;
+    difference_type* starts;
 
-    DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
-
-    sd.temporaries = new value_type*[num_threads];
-    //sd.oracles = new bin_index[n];
-    sd.dist = new difference_type*[num_bins + 1];
-    sd.bin_proc = new thread_index_t[num_bins];
-    for (bin_index b = 0; b < num_bins + 1; b++)
-      sd.dist[b] = new difference_type[num_threads + 1];
-    for (bin_index b = 0; b < (num_bins + 1); b++)
+#   pragma omp parallel num_threads(num_threads)
       {
-       sd.dist[0][0] = 0;
-       sd.dist[b][0] = 0;
-      }
-    difference_type* starts = sd.starts = new difference_type[num_threads + 1];
-    int bin_cursor = 0;
-    sd.num_bins = num_bins;
-    sd.num_bits = log2(num_bins);
-
-    difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
-    int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
-    for (int i = 0; i < num_threads; i++)
-      {
-       starts[i] = start;
-       start += (i < split) ? (chunk_length + 1) : chunk_length;
-       int j = pus[i].bins_begin = bin_cursor;
-
-       // Range of bins for this processor.
-       bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
-       pus[i].bins_end = bin_cursor;
-       for (; j < bin_cursor; j++)
-         sd.bin_proc[j] = i;
-       pus[i].num_threads = num_threads;
-       pus[i].iam = i;
-       pus[i].seed = rng(std::numeric_limits<uint32>::max());
-       pus[i].sd = &sd;
-      }
-    starts[num_threads] = start;
-
-    // Now shuffle in parallel.
-#pragma omp parallel num_threads(num_threads)
-    parallel_random_shuffle_drs_pu(pus);
+#       pragma omp single
+          {
+            pus = new DRSSorterPU<RandomAccessIterator, random_number>
+                [num_threads];
+
+            sd.temporaries = new value_type*[num_threads];
+            sd.dist = new difference_type*[num_bins + 1];
+            sd.bin_proc = new thread_index_t[num_bins];
+            for (bin_index b = 0; b < num_bins + 1; b++)
+              sd.dist[b] = new difference_type[num_threads + 1];
+            for (bin_index b = 0; b < (num_bins + 1); b++)
+              {
+                sd.dist[0][0] = 0;
+                sd.dist[b][0] = 0;
+              }
+            starts = sd.starts = new difference_type[num_threads + 1];
+            int bin_cursor = 0;
+            sd.num_bins = num_bins;
+            sd.num_bits = log2(num_bins);
+
+            difference_type chunk_length = n / num_threads,
+                            split = n % num_threads, start = 0;
+            difference_type bin_chunk_length = num_bins / num_threads,
+                            bin_split = num_bins % num_threads;
+            for (thread_index_t i = 0; i < num_threads; i++)
+              {
+                starts[i] = start;
+                start += (i < split) ? (chunk_length + 1) : chunk_length;
+                int j = pus[i].bins_begin = bin_cursor;
+
+                // Range of bins for this processor.
+                bin_cursor += (i < bin_split) ?
+                    (bin_chunk_length + 1) : bin_chunk_length;
+                pus[i].bins_end = bin_cursor;
+                for (; j < bin_cursor; j++)
+                  sd.bin_proc[j] = i;
+                pus[i].num_threads = num_threads;
+                pus[i].seed = rng(std::numeric_limits<uint32>::max());
+                pus[i].sd = &sd;
+              }
+            starts[num_threads] = start;
+          } //single
+      // Now shuffle in parallel.
+      parallel_random_shuffle_drs_pu(pus);
+    }
 
     delete[] starts;
     delete[] sd.bin_proc;
@@ -367,16 +386,16 @@ namespace __gnu_parallel
     delete[] pus;
   }
 
-  /** @brief Sequential cache-efficient random shuffle.
  *  @param begin Begin iterator of sequence.
  *  @param end End iterator of sequence.
  *  @param rng Random number generator to use.
  */
-  template<typename RandomAccessIterator, typename RandomNumberGenerator>
+/** @brief Sequential cache-efficient random shuffle.
+ *  @param begin Begin iterator of sequence.
+ *  @param end End iterator of sequence.
+ *  @param rng Random number generator to use.
+ */
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
   inline void
   sequential_random_shuffle(RandomAccessIterator begin, 
-                           RandomAccessIterator end, 
-                           RandomNumberGenerator& rng)
+                            RandomAccessIterator end,
+                            RandomNumberGenerator& rng)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -388,7 +407,9 @@ namespace __gnu_parallel
 
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
     // Try the L1 cache first, must fit into L1.
-    num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
+    num_bins_cache =
+        std::max<difference_type>
+            (1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
     num_bins_cache = round_up_to_pow2(num_bins_cache);
 
     // No more buckets than TLB entries, power of 2
@@ -403,19 +424,23 @@ namespace __gnu_parallel
     if (num_bins < num_bins_cache)
    &nbs