libitm/config/x86/unaligned.h

   1 /* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU Transactional Memory Library (libitm).
   5
   6    Libitm is free software; you can redistribute it and/or modify it
   7    under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  13    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14    more details.
  15
  16    Under Section 7 of GPL version 3, you are granted additional
  17    permissions described in the GCC Runtime Library Exception, version
  18    3.1, as published by the Free Software Foundation.
  19
  20    You should have received a copy of the GNU General Public License and
  21    a copy of the GCC Runtime Library Exception along with this program;
  22    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23    <http://www.gnu.org/licenses/>.  */
  24
  25 #ifndef LIBITM_X86_UNALIGNED_H
  26 #define LIBITM_X86_UNALIGNED_H 1
  27
  28 #define HAVE_ARCH_UNALIGNED_LOAD2_U4 1
  29 #define HAVE_ARCH_UNALIGNED_LOAD2_U8 1
  30
  31 #include "config/generic/unaligned.h"
  32
  33 namespace GTM HIDDEN {
  34
  35 template<>
  36 inline uint32_t
  37 unaligned_load2<uint32_t>(const gtm_cacheline *c1,
  38                           const gtm_cacheline *c2, size_t ofs)
  39 {
  40   uint32_t r, lo, hi;
  41   lo = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
  42   hi = c2->u32[0];
  43   asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 3) * 8), "0"(lo));
  44   return r;
  45 }
  46
  47 template<>
  48 inline uint64_t
  49 unaligned_load2<uint64_t>(const gtm_cacheline *c1,
  50                           const gtm_cacheline *c2, size_t ofs)
  51 {
  52 #ifdef __x86_64__
  53   uint64_t r, lo, hi;
  54   lo = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
  55   hi = c2->u64[0];
  56   asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 3) * 8), "0"(lo));
  57   return r;
  58 #else
  59   uint32_t v0, v1, v2;
  60   uint64_t r;
  61
  62   if (ofs < CACHELINE_SIZE - 4)
  63     {
  64       v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 2];
  65       v1 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
  66       v2 = c2->u32[0];
  67     }
  68   else
  69     {
  70       v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
  71       v1 = c2->u32[0];
  72       v2 = c2->u32[1];
  73     }
  74   ofs = (ofs & 3) * 8;
  75   asm("shrd %%cl, %[v1], %[v0]; shrd %%cl, %[v2], %[v1]"
  76       : "=A"(r) : "c"(ofs), [v0] "a"(v0), [v1] "d"(v1), [v2] "r"(v2));
  77
  78   return r;
  79 #endif
  80 }
  81
  82 #if defined(__SSE2__) || defined(__MMX__)
  83 template<>
  84 inline _ITM_TYPE_M64
  85 unaligned_load2<_ITM_TYPE_M64>(const gtm_cacheline *c1,
  86                                const gtm_cacheline *c2, size_t ofs)
  87 {
  88 # ifdef __x86_64__
  89   __m128i lo = _mm_movpi64_epi64 (c1->m64[CACHELINE_SIZE / 8 - 1]);
  90   __m128i hi = _mm_movpi64_epi64 (c2->m64[0]);
  91
  92   ofs = (ofs & 7) * 8;
  93   lo = _mm_srli_epi64 (lo, ofs);
  94   hi = _mm_slli_epi64 (hi, 64 - ofs);
  95   lo = lo | hi;
  96   return _mm_movepi64_pi64 (lo);
  97 # else
  98   // On 32-bit we're about to return the result in an MMX register, so go
  99   // ahead and do the computation in that unit, even if SSE2 is available.
 100   __m64 lo = c1->m64[CACHELINE_SIZE / 8 - 1];
 101   __m64 hi = c2->m64[0];
 102
 103   ofs = (ofs & 7) * 8;
 104   lo = _mm_srli_si64 (lo, ofs);
 105   hi = _mm_slli_si64 (hi, 64 - ofs);
 106   return lo | hi;
 107 # endif
 108 }
 109 #endif // SSE2 or MMX
 110
 111 // The SSE types are strictly aligned.
 112 #ifdef __SSE__
 113 template<>
 114   struct strict_alignment<_ITM_TYPE_M128>
 115     : public std::true_type
 116   { };
 117
 118 // Expand the unaligned SSE move instructions.
 119 template<>
 120 inline _ITM_TYPE_M128
 121 unaligned_load<_ITM_TYPE_M128>(const void *t)
 122 {
 123   return _mm_loadu_ps (static_cast<const float *>(t));
 124 }
 125
 126 template<>
 127 inline void
 128 unaligned_store<_ITM_TYPE_M128>(void *t, _ITM_TYPE_M128 val)
 129 {
 130   _mm_storeu_ps (static_cast<float *>(t), val);
 131 }
 132 #endif // SSE
 133
 134 #ifdef __AVX__
 135 // The AVX types are strictly aligned when it comes to vmovaps vs vmovups.
 136 template<>
 137   struct strict_alignment<_ITM_TYPE_M256>
 138     : public std::true_type
 139   { };
 140
 141 template<>
 142 inline _ITM_TYPE_M256
 143 unaligned_load<_ITM_TYPE_M256>(const void *t)
 144 {
 145   return _mm256_loadu_ps (static_cast<const float *>(t));
 146 }
 147
 148 template<>
 149 inline void
 150 unaligned_store<_ITM_TYPE_M256>(void *t, _ITM_TYPE_M256 val)
 151 {
 152   _mm256_storeu_ps (static_cast<float *>(t), val);
 153 }
 154 #endif // AVX
 155
 156 #ifdef __XOP__
 157 # define HAVE_ARCH_REALIGN_M128I 1
 158 extern const __v16qi GTM_vpperm_shift[16];
 159 inline __m128i
 160 realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
 161 {
 162   return _mm_perm_epi8 (lo, hi, GTM_vpperm_shift[byte_count]);
 163 }
 164 #elif defined(__AVX__)
 165 # define HAVE_ARCH_REALIGN_M128I 1
 166 extern "C" const uint64_t GTM_vpalignr_table[16];
 167 inline __m128i
 168 realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
 169 {
 170   register __m128i xmm0 __asm__("xmm0") = hi;
 171   register __m128i xmm1 __asm__("xmm1") = lo;
 172   __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
 173         "r"(&GTM_vpalignr_table[byte_count]));
 174   return xmm0;
 175 }
 176 #elif defined(__SSSE3__)
 177 # define HAVE_ARCH_REALIGN_M128I 1
 178 extern "C" const uint64_t GTM_palignr_table[16];
 179 inline __m128i
 180 realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
 181 {
 182   register __m128i xmm0 __asm__("xmm0") = hi;
 183   register __m128i xmm1 __asm__("xmm1") = lo;
 184   __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
 185         "r"(&GTM_palignr_table[byte_count]));
 186   return xmm0;
 187 }
 188 #elif defined(__SSE2__)
 189 # define HAVE_ARCH_REALIGN_M128I 1
 190 extern "C" const char GTM_pshift_table[16 * 16];
 191 inline __m128i
 192 realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
 193 {
 194   register __m128i xmm0 __asm__("xmm0") = lo;
 195   register __m128i xmm1 __asm__("xmm1") = hi;
 196   __asm("call *%2" : "+x"(xmm0), "+x"(xmm1)
 197         : "r"(GTM_pshift_table + byte_count*16));
 198   return xmm0;
 199 }
 200 #endif // XOP, AVX, SSSE3, SSE2
 201
 202 #ifdef HAVE_ARCH_REALIGN_M128I
 203 template<>
 204 inline _ITM_TYPE_M128
 205 unaligned_load2<_ITM_TYPE_M128>(const gtm_cacheline *c1,
 206                                 const gtm_cacheline *c2, size_t ofs)
 207 {
 208   return (_ITM_TYPE_M128)
 209     realign_m128i (c1->m128i[CACHELINE_SIZE / 16 - 1],
 210                    c2->m128i[0], ofs & 15);
 211 }
 212 #endif // HAVE_ARCH_REALIGN_M128I
 213
 214 #ifdef __AVX__
 215 template<>
 216 inline _ITM_TYPE_M256
 217 unaligned_load2<_ITM_TYPE_M256>(const gtm_cacheline *c1,
 218                                 const gtm_cacheline *c2, size_t ofs)
 219 {
 220   __m128i v0, v1;
 221   __m256i r;
 222
 223   v0 = (__m128i) unaligned_load2<_ITM_TYPE_M128>(c1, c2, ofs);
 224   if (ofs < CACHELINE_SIZE - 16)
 225     v1 = v0, v0 = _mm_loadu_si128 ((const __m128i *) &c1->b[ofs]);
 226   else
 227     v1 = _mm_loadu_si128((const __m128i *)&c2->b[ofs + 16 - CACHELINE_SIZE]);
 228
 229   r = _mm256_castsi128_si256 ((__m128i)v0);
 230   r = _mm256_insertf128_si256 (r, (__m128i)v1, 1);
 231   return (_ITM_TYPE_M256) r;
 232 }
 233 #endif // AVX
 234
 235 } // namespace GTM
 236
 237 #endif // LIBITM_X86_UNALIGNED_H