libitm/config/x86/cacheline.h

   1 /* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU Transactional Memory Library (libitm).
   5
   6    Libitm is free software; you can redistribute it and/or modify it
   7    under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  13    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14    more details.
  15
  16    Under Section 7 of GPL version 3, you are granted additional
  17    permissions described in the GCC Runtime Library Exception, version
  18    3.1, as published by the Free Software Foundation.
  19
  20    You should have received a copy of the GNU General Public License and
  21    a copy of the GCC Runtime Library Exception along with this program;
  22    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23    <http://www.gnu.org/licenses/>.  */
  24
  25 #ifndef LIBITM_CACHELINE_H
  26 #define LIBITM_CACHELINE_H 1
  27
  28 // Minimum cacheline size is 32, due to both complex long double and __m256.
  29 // There's no requirement that 64-bit use a 64-byte cacheline size, but do
  30 // so for now to make sure everything is parameterized properly.
  31 #ifdef __x86_64__
  32 # define CACHELINE_SIZE 64
  33 #else
  34 # define CACHELINE_SIZE 32
  35 #endif
  36
  37 namespace GTM HIDDEN {
  38
  39 // A gtm_cacheline_mask stores a modified bit for every modified byte
  40 // in the cacheline with which it is associated.
  41 typedef sized_integral<CACHELINE_SIZE / 8>::type gtm_cacheline_mask;
  42
  43 extern uint32_t const gtm_bit_to_byte_mask[16];
  44
  45 union gtm_cacheline
  46 {
  47   // Byte access to the cacheline.
  48   unsigned char b[CACHELINE_SIZE] __attribute__((aligned(CACHELINE_SIZE)));
  49
  50   // Larger sized access to the cacheline.
  51   uint16_t u16[CACHELINE_SIZE / sizeof(uint16_t)];
  52   uint32_t u32[CACHELINE_SIZE / sizeof(uint32_t)];
  53   uint64_t u64[CACHELINE_SIZE / sizeof(uint64_t)];
  54   gtm_word w[CACHELINE_SIZE / sizeof(gtm_word)];
  55
  56 #ifdef __MMX__
  57   __m64 m64[CACHELINE_SIZE / sizeof(__m64)];
  58 #endif
  59 #ifdef __SSE__
  60   __m128 m128[CACHELINE_SIZE / sizeof(__m128)];
  61 #endif
  62 #ifdef __SSE2__
  63   __m128i m128i[CACHELINE_SIZE / sizeof(__m128i)];
  64 #endif
  65 #ifdef __AVX__
  66   __m256 m256[CACHELINE_SIZE / sizeof(__m256)];
  67   __m256i m256i[CACHELINE_SIZE / sizeof(__m256i)];
  68 #endif
  69
  70   // Store S into D, but only the bytes specified by M.
  71   static void store_mask (uint32_t *d, uint32_t s, uint8_t m);
  72   static void store_mask (uint64_t *d, uint64_t s, uint8_t m);
  73 #ifdef __SSE2__
  74   static void store_mask (__m128i *d, __m128i s, uint16_t m);
  75 #endif
  76
  77   // Copy S to D, but only the bytes specified by M.
  78   static void copy_mask (gtm_cacheline * __restrict d,
  79                          const gtm_cacheline * __restrict s,
  80                          gtm_cacheline_mask m);
  81
  82   // A write barrier to emit after (a series of) copy_mask.
  83   // When we're emitting non-temporal stores, the normal strong
  84   // ordering of the machine doesn't apply.
  85   static void copy_mask_wb ();
  86
  87 #if defined(__SSE__) || defined(__AVX__)
  88   // Copy S to D; only bother defining if we can do this more efficiently
  89   // than the compiler-generated default implementation.
  90   gtm_cacheline& operator= (const gtm_cacheline &s);
  91 #endif // SSE, AVX
  92 };
  93
  94 inline void
  95 gtm_cacheline::copy_mask_wb ()
  96 {
  97 #ifdef __SSE2__
  98   _mm_sfence ();
  99 #endif
 100 }
 101
 102 #if defined(__SSE__) || defined(__AVX__)
 103 inline gtm_cacheline& ALWAYS_INLINE
 104 gtm_cacheline::operator= (const gtm_cacheline & __restrict s)
 105 {
 106 #ifdef __AVX__
 107 # define CP     m256
 108 # define TYPE   __m256
 109 #else
 110 # define CP     m128
 111 # define TYPE   __m128
 112 #endif
 113
 114   TYPE w, x, y, z;
 115
 116   // ??? Wouldn't it be nice to have a pragma to tell the compiler
 117   // to completely unroll a given loop?
 118   switch (CACHELINE_SIZE / sizeof(TYPE))
 119     {
 120     case 1:
 121       this->CP[0] = s.CP[0];
 122       break;
 123     case 2:
 124       x = s.CP[0];
 125       y = s.CP[1];
 126       this->CP[0] = x;
 127       this->CP[1] = y;
 128       break;
 129     case 4:
 130       w = s.CP[0];
 131       x = s.CP[1];
 132       y = s.CP[2];
 133       z = s.CP[3];
 134       this->CP[0] = w;
 135       this->CP[1] = x;
 136       this->CP[2] = y;
 137       this->CP[3] = z;
 138       break;
 139     default:
 140       __builtin_trap ();
 141     }
 142
 143   return *this;
 144 }
 145 #endif
 146
 147 // ??? Support masked integer stores more efficiently with an unlocked cmpxchg
 148 // insn.  My reasoning is that while we write to locations that we do not wish
 149 // to modify, we do it in an uninterruptable insn, and so we either truely
 150 // write back the original data or the insn fails -- unlike with a
 151 // load/and/or/write sequence which can be interrupted either by a kernel
 152 // task switch or an unlucky cacheline steal by another processor.  Avoiding
 153 // the LOCK prefix improves performance by a factor of 10, and we don't need
 154 // the memory barrier semantics implied by that prefix.
 155
 156 inline void ALWAYS_INLINE
 157 gtm_cacheline::store_mask (uint32_t *d, uint32_t s, uint8_t m)
 158 {
 159   gtm_cacheline_mask tm = (1 << sizeof (s)) - 1;
 160   if (__builtin_expect (m & tm, tm))
 161     {
 162       if (__builtin_expect ((m & tm) == tm, 1))
 163         *d = s;
 164       else
 165         {
 166           gtm_cacheline_mask bm = gtm_bit_to_byte_mask[m & 15];
 167           gtm_word n, o = *d;
 168
 169           __asm("\n0:\t"
 170                 "mov    %[o], %[n]\n\t"
 171                 "and    %[m], %[n]\n\t"
 172                 "or     %[s], %[n]\n\t"
 173                 "cmpxchg %[n], %[d]\n\t"
 174                 "jnz,pn 0b"
 175                 : [d] "+m"(*d), [n] "=&r" (n), [o] "+a"(o)
 176                 : [s] "r" (s & bm), [m] "r" (~bm));
 177         }
 178     }
 179 }
 180
 181 inline void ALWAYS_INLINE
 182 gtm_cacheline::store_mask (uint64_t *d, uint64_t s, uint8_t m)
 183 {
 184   gtm_cacheline_mask tm = (1 << sizeof (s)) - 1;
 185   if (__builtin_expect (m & tm, tm))
 186     {
 187       if (__builtin_expect ((m & tm) == tm, 1))
 188         *d = s;
 189       else
 190         {
 191 #ifdef __x86_64__
 192           uint32_t bl = gtm_bit_to_byte_mask[m & 15];
 193           uint32_t bh = gtm_bit_to_byte_mask[(m >> 4) & 15];
 194           gtm_cacheline_mask bm = bl | ((gtm_cacheline_mask)bh << 31 << 1);
 195           uint64_t n, o = *d;
 196           __asm("\n0:\t"
 197                 "mov    %[o], %[n]\n\t"
 198                 "and    %[m], %[n]\n\t"
 199                 "or     %[s], %[n]\n\t"
 200                 "cmpxchg %[n], %[d]\n\t"
 201                 "jnz,pn 0b"
 202                 : [d] "+m"(*d), [n] "=&r" (n), [o] "+a"(o)
 203                 : [s] "r" (s & bm), [m] "r" (~bm));
 204 #else
 205           /* ??? While it's possible to perform this operation with
 206              cmpxchg8b, the sequence requires all 7 general registers
 207              and thus cannot be performed with -fPIC.  Don't even try.  */
 208           uint32_t *d32 = reinterpret_cast<uint32_t *>(d);
 209           store_mask (d32, s, m);
 210           store_mask (d32 + 1, s >> 32, m >> 4);
 211 #endif
 212         }
 213     }
 214 }
 215
 216 #ifdef __SSE2__
 217 inline void ALWAYS_INLINE
 218 gtm_cacheline::store_mask (__m128i *d, __m128i s, uint16_t m)
 219 {
 220   if (__builtin_expect (m == 0, 0))
 221     return;
 222   if (__builtin_expect (m == 0xffff, 1))
 223     *d = s;
 224   else
 225     {
 226       __m128i bm0, bm1, bm2, bm3;
 227       bm0 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
 228       bm1 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
 229       bm2 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
 230       bm3 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
 231       bm0 = _mm_unpacklo_epi32 (bm0, bm1);
 232       bm2 = _mm_unpacklo_epi32 (bm2, bm3);
 233       bm0 = _mm_unpacklo_epi64 (bm0, bm2);
 234
 235       _mm_maskmoveu_si128 (s, bm0, (char *)d);
 236     }
 237 }
 238 #endif // SSE2
 239
 240 } // namespace GTM
 241
 242 #endif // LIBITM_CACHELINE_H