libgomp/iter.c

   1 /* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU OpenMP Library (libgomp).
   5
   6    Libgomp is free software; you can redistribute it and/or modify it
   7    under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  13    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14    more details.
  15
  16    Under Section 7 of GPL version 3, you are granted additional
  17    permissions described in the GCC Runtime Library Exception, version
  18    3.1, as published by the Free Software Foundation.
  19
  20    You should have received a copy of the GNU General Public License and
  21    a copy of the GCC Runtime Library Exception along with this program;
  22    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23    <http://www.gnu.org/licenses/>.  */
  24
  25 /* This file contains routines for managing work-share iteration, both
  26    for loops and sections.  */
  27
  28 #include "libgomp.h"
  29 #include <stdlib.h>
  30
  31
  32 /* This function implements the STATIC scheduling method.  The caller should
  33    iterate *pstart <= x < *pend.  Return zero if there are more iterations
  34    to perform; nonzero if not.  Return less than 0 if this thread had
  35    received the absolutely last iteration.  */
  36
  37 int
  38 gomp_iter_static_next (long *pstart, long *pend)
  39 {
  40   struct gomp_thread *thr = gomp_thread ();
  41   struct gomp_team *team = thr->ts.team;
  42   struct gomp_work_share *ws = thr->ts.work_share;
  43   unsigned long nthreads = team ? team->nthreads : 1;
  44
  45   if (thr->ts.static_trip == -1)
  46     return -1;
  47
  48   /* Quick test for degenerate teams and orphaned constructs.  */
  49   if (nthreads == 1)
  50     {
  51       *pstart = ws->next;
  52       *pend = ws->end;
  53       thr->ts.static_trip = -1;
  54       return ws->next == ws->end;
  55     }
  56
  57   /* We interpret chunk_size zero as "unspecified", which means that we
  58      should break up the iterations such that each thread makes only one
  59      trip through the outer loop.  */
  60   if (ws->chunk_size == 0)
  61     {
  62       unsigned long n, q, i;
  63       unsigned long s0, e0;
  64       long s, e;
  65
  66       if (thr->ts.static_trip > 0)
  67         return 1;
  68
  69       /* Compute the total number of iterations.  */
  70       s = ws->incr + (ws->incr > 0 ? -1 : 1);
  71       n = (ws->end - ws->next + s) / ws->incr;
  72       i = thr->ts.team_id;
  73
  74       /* Compute the "zero-based" start and end points.  That is, as
  75          if the loop began at zero and incremented by one.  */
  76       q = n / nthreads;
  77       q += (q * nthreads != n);
  78       s0 = q * i;
  79       e0 = s0 + q;
  80       if (e0 > n)
  81         e0 = n;
  82
  83       /* Notice when no iterations allocated for this thread.  */
  84       if (s0 >= e0)
  85         {
  86           thr->ts.static_trip = 1;
  87           return 1;
  88         }
  89
  90       /* Transform these to the actual start and end numbers.  */
  91       s = (long)s0 * ws->incr + ws->next;
  92       e = (long)e0 * ws->incr + ws->next;
  93
  94       *pstart = s;
  95       *pend = e;
  96       thr->ts.static_trip = (e0 == n ? -1 : 1);
  97       return 0;
  98     }
  99   else
 100     {
 101       unsigned long n, s0, e0, i, c;
 102       long s, e;
 103
 104       /* Otherwise, each thread gets exactly chunk_size iterations
 105          (if available) each time through the loop.  */
 106
 107       s = ws->incr + (ws->incr > 0 ? -1 : 1);
 108       n = (ws->end - ws->next + s) / ws->incr;
 109       i = thr->ts.team_id;
 110       c = ws->chunk_size;
 111
 112       /* Initial guess is a C sized chunk positioned nthreads iterations
 113          in, offset by our thread number.  */
 114       s0 = (thr->ts.static_trip * nthreads + i) * c;
 115       e0 = s0 + c;
 116
 117       /* Detect overflow.  */
 118       if (s0 >= n)
 119         return 1;
 120       if (e0 > n)
 121         e0 = n;
 122
 123       /* Transform these to the actual start and end numbers.  */
 124       s = (long)s0 * ws->incr + ws->next;
 125       e = (long)e0 * ws->incr + ws->next;
 126
 127       *pstart = s;
 128       *pend = e;
 129
 130       if (e0 == n)
 131         thr->ts.static_trip = -1;
 132       else
 133         thr->ts.static_trip++;
 134       return 0;
 135     }
 136 }
 137
 138
 139 /* This function implements the DYNAMIC scheduling method.  Arguments are
 140    as for gomp_iter_static_next.  This function must be called with ws->lock
 141    held.  */
 142
 143 bool
 144 gomp_iter_dynamic_next_locked (long *pstart, long *pend)
 145 {
 146   struct gomp_thread *thr = gomp_thread ();
 147   struct gomp_work_share *ws = thr->ts.work_share;
 148   long start, end, chunk, left;
 149
 150   start = ws->next;
 151   if (start == ws->end)
 152     return false;
 153
 154   chunk = ws->chunk_size;
 155   left = ws->end - start;
 156   if (ws->incr < 0)
 157     {
 158       if (chunk < left)
 159         chunk = left;
 160     }
 161   else
 162     {
 163       if (chunk > left)
 164         chunk = left;
 165     }
 166   end = start + chunk;
 167
 168   ws->next = end;
 169   *pstart = start;
 170   *pend = end;
 171   return true;
 172 }
 173
 174
 175 #ifdef HAVE_SYNC_BUILTINS
 176 /* Similar, but doesn't require the lock held, and uses compare-and-swap
 177    instead.  Note that the only memory value that changes is ws->next.  */
 178
 179 bool
 180 gomp_iter_dynamic_next (long *pstart, long *pend)
 181 {
 182   struct gomp_thread *thr = gomp_thread ();
 183   struct gomp_work_share *ws = thr->ts.work_share;
 184   long start, end, nend, chunk, incr;
 185
 186   end = ws->end;
 187   incr = ws->incr;
 188   chunk = ws->chunk_size;
 189
 190   if (__builtin_expect (ws->mode, 1))
 191     {
 192       long tmp = __sync_fetch_and_add (&ws->next, chunk);
 193       if (incr > 0)
 194         {
 195           if (tmp >= end)
 196             return false;
 197           nend = tmp + chunk;
 198           if (nend > end)
 199             nend = end;
 200           *pstart = tmp;
 201           *pend = nend;
 202           return true;
 203         }
 204       else
 205         {
 206           if (tmp <= end)
 207             return false;
 208           nend = tmp + chunk;
 209           if (nend < end)
 210             nend = end;
 211           *pstart = tmp;
 212           *pend = nend;
 213           return true;
 214         }
 215     }
 216
 217   start = ws->next;
 218   while (1)
 219     {
 220       long left = end - start;
 221       long tmp;
 222
 223       if (start == end)
 224         return false;
 225
 226       if (incr < 0)
 227         {
 228           if (chunk < left)
 229             chunk = left;
 230         }
 231       else
 232         {
 233           if (chunk > left)
 234             chunk = left;
 235         }
 236       nend = start + chunk;
 237
 238       tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
 239       if (__builtin_expect (tmp == start, 1))
 240         break;
 241
 242       start = tmp;
 243     }
 244
 245   *pstart = start;
 246   *pend = nend;
 247   return true;
 248 }
 249 #endif /* HAVE_SYNC_BUILTINS */
 250
 251
 252 /* This function implements the GUIDED scheduling method.  Arguments are
 253    as for gomp_iter_static_next.  This function must be called with the
 254    work share lock held.  */
 255
 256 bool
 257 gomp_iter_guided_next_locked (long *pstart, long *pend)
 258 {
 259   struct gomp_thread *thr = gomp_thread ();
 260   struct gomp_work_share *ws = thr->ts.work_share;
 261   struct gomp_team *team = thr->ts.team;
 262   unsigned long nthreads = team ? team->nthreads : 1;
 263   unsigned long n, q;
 264   long start, end;
 265
 266   if (ws->next == ws->end)
 267     return false;
 268
 269   start = ws->next;
 270   n = (ws->end - start) / ws->incr;
 271   q = (n + nthreads - 1) / nthreads;
 272
 273   if (q < ws->chunk_size)
 274     q = ws->chunk_size;
 275   if (q <= n)
 276     end = start + q * ws->incr;
 277   else
 278     end = ws->end;
 279
 280   ws->next = end;
 281   *pstart = start;
 282   *pend = end;
 283   return true;
 284 }
 285
 286 #ifdef HAVE_SYNC_BUILTINS
 287 /* Similar, but doesn't require the lock held, and uses compare-and-swap
 288    instead.  Note that the only memory value that changes is ws->next.  */
 289
 290 bool
 291 gomp_iter_guided_next (long *pstart, long *pend)
 292 {
 293   struct gomp_thread *thr = gomp_thread ();
 294   struct gomp_work_share *ws = thr->ts.work_share;
 295   struct gomp_team *team = thr->ts.team;
 296   unsigned long nthreads = team ? team->nthreads : 1;
 297   long start, end, nend, incr;
 298   unsigned long chunk_size;
 299
 300   start = ws->next;
 301   end = ws->end;
 302   incr = ws->incr;
 303   chunk_size = ws->chunk_size;
 304
 305   while (1)
 306     {
 307       unsigned long n, q;
 308       long tmp;
 309
 310       if (start == end)
 311         return false;
 312
 313       n = (end - start) / incr;
 314       q = (n + nthreads - 1) / nthreads;
 315
 316       if (q < chunk_size)
 317         q = chunk_size;
 318       if (__builtin_expect (q <= n, 1))
 319         nend = start + q * incr;
 320       else
 321         nend = end;
 322
 323       tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
 324       if (__builtin_expect (tmp == start, 1))
 325         break;
 326
 327       start = tmp;
 328     }
 329
 330   *pstart = start;
 331   *pend = nend;
 332   return true;
 333 }
 334 #endif /* HAVE_SYNC_BUILTINS */