/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Transactional Memory Library (libitm).

   Libitm is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef LIBITM_CACHELINE_H
#define LIBITM_CACHELINE_H 1

// Minimum cacheline size is 32, due to both complex long double and __m256.
// There's no requirement that 64-bit use a 64-byte cacheline size, but do
// so for now to make sure everything is parameterized properly.
#ifdef __x86_64__
# define CACHELINE_SIZE 64
#else
# define CACHELINE_SIZE 32
#endif

namespace GTM HIDDEN {

// A gtm_cacheline_mask stores a modified bit for every modified byte
// in the cacheline with which it is associated.
typedef sized_integral<CACHELINE_SIZE / 8>::type gtm_cacheline_mask;
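// For example, with CACHELINE_SIZE == 64 this is an 8-byte integer
// (uint64_t): bit N is set iff byte N of the line has been modified.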

extern uint32_t const gtm_bit_to_byte_mask[16];
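// Each entry expands a 4-bit byte selector into a 32-bit mask with 0xff
// in every selected byte, e.g. index 0b0101 -> 0x00ff00ff; the table
// itself is defined elsewhere in libitm.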

union gtm_cacheline
{
  // Byte access to the cacheline.
  unsigned char b[CACHELINE_SIZE] __attribute__((aligned(CACHELINE_SIZE)));

  // Larger sized access to the cacheline.
  uint16_t u16[CACHELINE_SIZE / sizeof(uint16_t)];
  uint32_t u32[CACHELINE_SIZE / sizeof(uint32_t)];
  uint64_t u64[CACHELINE_SIZE / sizeof(uint64_t)];
  gtm_word w[CACHELINE_SIZE / sizeof(gtm_word)];

#ifdef __MMX__
  __m64 m64[CACHELINE_SIZE / sizeof(__m64)];
#endif
#ifdef __SSE__
  __m128 m128[CACHELINE_SIZE / sizeof(__m128)];
#endif
#ifdef __SSE2__
  __m128i m128i[CACHELINE_SIZE / sizeof(__m128i)];
#endif
#ifdef __AVX__
  __m256 m256[CACHELINE_SIZE / sizeof(__m256)];
  __m256i m256i[CACHELINE_SIZE / sizeof(__m256i)];
#endif

  // Store S into D, but only the bytes specified by M.
  static void store_mask (uint32_t *d, uint32_t s, uint8_t m);
  static void store_mask (uint64_t *d, uint64_t s, uint8_t m);
#ifdef __SSE2__
  static void store_mask (__m128i *d, __m128i s, uint16_t m);
#endif
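  // In each overload, bit I of M selects byte I of the store; the
  // low-order mask bit corresponds to the lowest-addressed byte.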

  // Copy S to D, but only the bytes specified by M.
  static void copy_mask (gtm_cacheline * __restrict d,
                         const gtm_cacheline * __restrict s,
                         gtm_cacheline_mask m);

  // A write barrier to emit after (a series of) copy_mask.
  // When we're emitting non-temporal stores, the normal strong
  // ordering of the machine doesn't apply.
  static void copy_mask_wb ();

#if defined(__SSE__) || defined(__AVX__)
  // Copy S to D; only bother defining if we can do this more efficiently
  // than the compiler-generated default implementation.
  gtm_cacheline& operator= (const gtm_cacheline &s);
#endif // SSE, AVX
};

inline void ALWAYS_INLINE
gtm_cacheline::copy_mask_wb ()
{
#ifdef __SSE2__
  _mm_sfence ();
#endif
}
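
// Note: after a series of copy_mask calls that may have used
// non-temporal stores (e.g. MASKMOVDQU below), copy_mask_wb should run
// before the copied line is made visible to other threads.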

#if defined(__SSE__) || defined(__AVX__)
inline gtm_cacheline& ALWAYS_INLINE
gtm_cacheline::operator= (const gtm_cacheline & __restrict s)
{
#ifdef __AVX__
# define CP   m256
# define TYPE __m256
#else
# define CP   m128
# define TYPE __m128
#endif

  // ??? Wouldn't it be nice to have a pragma to tell the compiler
  // to completely unroll a given loop?
  switch (CACHELINE_SIZE / sizeof(TYPE))
    {
    case 1:
      this->CP[0] = s.CP[0];
      break;
    case 2:
      this->CP[0] = s.CP[0];
      this->CP[1] = s.CP[1];
      break;
    case 4:
      this->CP[0] = s.CP[0];
      this->CP[1] = s.CP[1];
      this->CP[2] = s.CP[2];
      this->CP[3] = s.CP[3];
      break;
    default:
      __builtin_trap ();
    }

  return *this;
}

#undef CP
#undef TYPE
#endif // SSE, AVX
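
// Note: with CACHELINE_SIZE == 64 the switch above resolves to case 2
// under AVX (two 32-byte moves) and case 4 under SSE (four 16-byte
// moves); the aligned union members allow full-width vector accesses.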

// ??? Support masked integer stores more efficiently with an unlocked cmpxchg
// insn.  My reasoning is that while we write to locations that we do not wish
// to modify, we do it in an uninterruptible insn, and so we either truly
// write back the original data or the insn fails -- unlike with a
// load/and/or/write sequence which can be interrupted either by a kernel
// task switch or an unlucky cacheline steal by another processor.  Avoiding
// the LOCK prefix improves performance by a factor of 10, and we don't need
// the memory barrier semantics implied by that prefix.
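// In effect, each masked store below computes *d = (*d & ~bm) | (s & bm),
// committing the result with an unlocked CMPXCHG and retrying if another
// writer changed *d in the meantime.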

inline void ALWAYS_INLINE
gtm_cacheline::store_mask (uint32_t *d, uint32_t s, uint8_t m)
{
  gtm_cacheline_mask tm = (1 << sizeof (s)) - 1;
  if (__builtin_expect (m & tm, tm))
    {
      if (__builtin_expect ((m & tm) == tm, 1))
        *d = s;
      else
        {
          gtm_cacheline_mask bm = gtm_bit_to_byte_mask[m & 15];

          uint32_t n, o = *d;
          __asm("\n0:\t"
                "mov %[o], %[n]\n\t"
                "and %[m], %[n]\n\t"
                "or %[s], %[n]\n\t"
                "cmpxchg %[n], %[d]\n\t"
                "jnz 0b"
                : [d] "+m"(*d), [n] "=&r" (n), [o] "+a"(o)
                : [s] "r" (s & bm), [m] "r" (~bm));
        }
    }
}
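
// Worked example: store_mask (d, 0xaabbccdd, 0x3) yields
// bm == 0x0000ffff, so the low two bytes of *d become 0xccdd while
// the high two bytes are preserved.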

inline void ALWAYS_INLINE
gtm_cacheline::store_mask (uint64_t *d, uint64_t s, uint8_t m)
{
  gtm_cacheline_mask tm = (1 << sizeof (s)) - 1;
  if (__builtin_expect (m & tm, tm))
    {
      if (__builtin_expect ((m & tm) == tm, 1))
        *d = s;
      else
        {
#ifdef __x86_64__
          uint32_t bl = gtm_bit_to_byte_mask[m & 15];
          uint32_t bh = gtm_bit_to_byte_mask[(m >> 4) & 15];
          gtm_cacheline_mask bm = bl | ((gtm_cacheline_mask)bh << 31 << 1);
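          // Note: "<< 31 << 1" is a 32-bit shift split in two, so the
          // expression stays well defined even if gtm_cacheline_mask is
          // ever only 32 bits wide (a single << 32 would be undefined).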
200 "cmpxchg %[n], %[d]\n\t"
202 : [d] "+m"(*d), [n] "=&r" (n), [o] "+a"(o)
203 : [s] "r" (s & bm), [m] "r" (~bm));
#else
          /* ??? While it's possible to perform this operation with
             cmpxchg8b, the sequence requires all 7 general registers
             and thus cannot be performed with -fPIC.  Don't even try.  */
          uint32_t *d32 = reinterpret_cast<uint32_t *>(d);
          store_mask (d32, s, m);
          store_mask (d32 + 1, s >> 32, m >> 4);
#endif
        }
    }
}
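
// Note that on 32-bit the two halves go through separate cmpxchg
// loops, so the 64-bit store is not atomic as a whole; each 32-bit
// half is.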

#ifdef __SSE2__
inline void ALWAYS_INLINE
gtm_cacheline::store_mask (__m128i *d, __m128i s, uint16_t m)
{
  if (__builtin_expect (m == 0, 0))
    return;
  if (__builtin_expect (m == 0xffff, 1))
    *d = s;
  else
    {
      __m128i bm0, bm1, bm2, bm3;
      bm0 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
      bm1 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
      bm2 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
      bm3 = _mm_set_epi32 (0, 0, 0, gtm_bit_to_byte_mask[m & 15]); m >>= 4;
      bm0 = _mm_unpacklo_epi32 (bm0, bm1);
      bm2 = _mm_unpacklo_epi32 (bm2, bm3);
      bm0 = _mm_unpacklo_epi64 (bm0, bm2);
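      // bm0 now holds one mask byte (0x00 or 0xff) per byte of S;
      // MASKMOVDQU stores byte J of S iff the high bit of mask byte J
      // is set, using a non-temporal hint (hence copy_mask_wb above).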
      _mm_maskmoveu_si128 (s, bm0, (char *)d);
    }
}
#endif // SSE2

} // namespace GTM

#endif // LIBITM_CACHELINE_H