atomicops.h
// ©2013-2015 Cameron Desrochers.
// Distributed under the simplified BSD license (see the license file that
// should have come with this header).
// Uses Jeff Preshing's semaphore implementation (under the terms of its
// separate zlib license, embedded below).

#pragma once

// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation
// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment).
// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees).
// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols.

#include <cassert>
#include <type_traits>


// Platform detection
#if defined(__INTEL_COMPILER)
#define AE_ICC
#elif defined(_MSC_VER)
#define AE_VCPP
#elif defined(__GNUC__)
#define AE_GCC
#endif

#if defined(_M_IA64) || defined(__ia64__)
#define AE_ARCH_IA64
#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
#define AE_ARCH_X64
#elif defined(_M_IX86) || defined(__i386__)
#define AE_ARCH_X86
#elif defined(_M_PPC) || defined(__powerpc__)
#define AE_ARCH_PPC
#else
#define AE_ARCH_UNKNOWN
#endif


// AE_UNUSED
#define AE_UNUSED(x) ((void)x)


// AE_FORCEINLINE
#if defined(AE_VCPP) || defined(AE_ICC)
#define AE_FORCEINLINE __forceinline
#elif defined(AE_GCC)
//#define AE_FORCEINLINE __attribute__((always_inline))
#define AE_FORCEINLINE inline
#else
#define AE_FORCEINLINE inline
#endif


// AE_ALIGN
#if defined(AE_VCPP) || defined(AE_ICC)
#define AE_ALIGN(x) __declspec(align(x))
#elif defined(AE_GCC)
#define AE_ALIGN(x) __attribute__((aligned(x)))
#else
// Assume GCC compliant syntax...
#define AE_ALIGN(x) __attribute__((aligned(x)))
#endif
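
// Example (illustrative only; `counter` and `read_counter` are hypothetical names):
//     AE_ALIGN(64) static int counter;                         // cache-line-aligned storage
//     AE_FORCEINLINE int read_counter() { return counter; }    // request aggressive inlining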


// Portable atomic fences implemented below:

namespace moodycamel {

enum memory_order {
	memory_order_relaxed,
	memory_order_acquire,
	memory_order_release,
	memory_order_acq_rel,
	memory_order_seq_cst,

	// memory_order_sync: Forces a full sync:
	// #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
	memory_order_sync = memory_order_seq_cst
};

}	// end namespace moodycamel

#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || defined(AE_ICC)
// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences

#include <intrin.h>

#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
#define AeFullSync _mm_mfence
#define AeLiteSync _mm_mfence
#elif defined(AE_ARCH_IA64)
#define AeFullSync __mf
#define AeLiteSync __mf
#elif defined(AE_ARCH_PPC)
#include <ppcintrinsics.h>
#define AeFullSync __sync
#define AeLiteSync __lwsync
#endif


#ifdef AE_VCPP
#pragma warning(push)
#pragma warning(disable: 4365)		// Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert`
#ifdef __cplusplus_cli
#pragma managed(push, off)
#endif
#endif

namespace moodycamel {

AE_FORCEINLINE void compiler_fence(memory_order order)
{
	switch (order) {
		case memory_order_relaxed: break;
		case memory_order_acquire: _ReadBarrier(); break;
		case memory_order_release: _WriteBarrier(); break;
		case memory_order_acq_rel: _ReadWriteBarrier(); break;
		case memory_order_seq_cst: _ReadWriteBarrier(); break;
		default: assert(false);
	}
}

// x86/x64 have a strong memory model -- all loads and stores have
// acquire and release semantics automatically (so only need compiler
// barriers for those).
#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
AE_FORCEINLINE void fence(memory_order order)
{
	switch (order) {
		case memory_order_relaxed: break;
		case memory_order_acquire: _ReadBarrier(); break;
		case memory_order_release: _WriteBarrier(); break;
		case memory_order_acq_rel: _ReadWriteBarrier(); break;
		case memory_order_seq_cst:
			_ReadWriteBarrier();
			AeFullSync();
			_ReadWriteBarrier();
			break;
		default: assert(false);
	}
}
#else
AE_FORCEINLINE void fence(memory_order order)
{
	// Non-specialized arch, use heavier memory barriers everywhere just in case :-(
	switch (order) {
		case memory_order_relaxed:
			break;
		case memory_order_acquire:
			_ReadBarrier();
			AeLiteSync();
			_ReadBarrier();
			break;
		case memory_order_release:
			_WriteBarrier();
			AeLiteSync();
			_WriteBarrier();
			break;
		case memory_order_acq_rel:
			_ReadWriteBarrier();
			AeLiteSync();
			_ReadWriteBarrier();
			break;
		case memory_order_seq_cst:
			_ReadWriteBarrier();
			AeFullSync();
			_ReadWriteBarrier();
			break;
		default: assert(false);
	}
}
#endif
}	// end namespace moodycamel
#else
// Use the standard library's atomic fences
#include <atomic>

namespace moodycamel {

AE_FORCEINLINE void compiler_fence(memory_order order)
{
	switch (order) {
		case memory_order_relaxed: break;
		case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break;
		case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break;
		case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break;
		case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break;
		default: assert(false);
	}
}

AE_FORCEINLINE void fence(memory_order order)
{
	switch (order) {
		case memory_order_relaxed: break;
		case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break;
		case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break;
		case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break;
		case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break;
		default: assert(false);
	}
}

}	// end namespace moodycamel

#endif
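
// Illustrative sketch of how these fences are meant to be paired (hypothetical names;
// `payload` and `dataReady` stand for weak_atomic<int> variables declared elsewhere):
//
//     // Producer:
//     payload = 42;
//     moodycamel::fence(moodycamel::memory_order_release);     // #LoadStore + #StoreStore before the flag store
//     dataReady = 1;
//
//     // Consumer:
//     if (dataReady) {
//         moodycamel::fence(moodycamel::memory_order_acquire); // #LoadLoad + #LoadStore after the flag load
//         int x = payload;                                      // observes 42 if the flag was seen as 1
//     }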


#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli))
#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
#endif

#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
#include <atomic>
#endif
#include <utility>

// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
namespace moodycamel {
template<typename T>
class weak_atomic
{
public:
	weak_atomic() { }
#ifdef AE_VCPP
#pragma warning(disable: 4100)		// Get rid of (erroneous) 'unreferenced formal parameter' warning
#endif
	template<typename U> weak_atomic(U&& x) : value(std::forward<U>(x)) { }
#ifdef __cplusplus_cli
	// Work around bug with universal reference/nullptr combination that only appears when /clr is on
	weak_atomic(nullptr_t) : value(nullptr) { }
#endif
	weak_atomic(weak_atomic const& other) : value(other.value) { }
	weak_atomic(weak_atomic&& other) : value(std::move(other.value)) { }
#ifdef AE_VCPP
#pragma warning(default: 4100)
#endif

	AE_FORCEINLINE operator T() const { return load(); }


#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
	template<typename U> AE_FORCEINLINE weak_atomic const& operator=(U&& x) { value = std::forward<U>(x); return *this; }
	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) { value = other.value; return *this; }

	AE_FORCEINLINE T load() const { return value; }

	AE_FORCEINLINE T fetch_add_acquire(T increment)
	{
#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
#if defined(_M_AMD64)
		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
#endif
#else
#error Unsupported platform
#endif
		assert(false && "T must be either a 32 or 64 bit type");
		return value;
	}

	AE_FORCEINLINE T fetch_add_release(T increment)
	{
#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
#if defined(_M_AMD64)
		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
#endif
#else
#error Unsupported platform
#endif
		assert(false && "T must be either a 32 or 64 bit type");
		return value;
	}
#else
	template<typename U>
	AE_FORCEINLINE weak_atomic const& operator=(U&& x)
	{
		value.store(std::forward<U>(x), std::memory_order_relaxed);
		return *this;
	}

	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other)
	{
		value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
		return *this;
	}

	AE_FORCEINLINE T load() const { return value.load(std::memory_order_relaxed); }

	AE_FORCEINLINE T fetch_add_acquire(T increment)
	{
		return value.fetch_add(increment, std::memory_order_acquire);
	}

	AE_FORCEINLINE T fetch_add_release(T increment)
	{
		return value.fetch_add(increment, std::memory_order_release);
	}
#endif


private:
#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
	// No std::atomic support, but still need to circumvent compiler optimizations.
	// `volatile` will make memory access slow, but is guaranteed to be reliable.
	volatile T value;
#else
	std::atomic<T> value;
#endif
};

}	// end namespace moodycamel
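
// Illustrative usage sketch (hypothetical names; weak_atomic on its own gives
// atomicity but no ordering -- pair it with fence() when ordering matters):
//
//     moodycamel::weak_atomic<std::size_t> tail(0);
//
//     // Producer: write the element first, then publish the new tail with release semantics.
//     tail.fetch_add_release(1);
//
//     // Consumer: read the index, then acquire before touching the element it guards.
//     std::size_t t = tail.load();
//     moodycamel::fence(moodycamel::memory_order_acquire);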



// Portable single-producer, single-consumer semaphore below:

#if defined(_WIN32)
// Avoid including windows.h in a header; we only need a handful of
// items, so we'll redeclare them here (this is relatively safe since
// the API generally has to remain stable between Windows versions).
// I know this is an ugly hack but it still beats polluting the global
// namespace with thousands of generic names or adding a .cpp for nothing.
extern "C" {
	struct _SECURITY_ATTRIBUTES;
	__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
	__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
	__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
	__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
}
#elif defined(__MACH__)
#include <mach/mach.h>
#elif defined(__unix__)
#include <semaphore.h>
#include <errno.h>	// for errno / EINTR in Semaphore::wait() below
#endif

namespace moodycamel
{
	// Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's
	// portable + lightweight semaphore implementations, originally from
	// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
	// LICENSE:
	// Copyright (c) 2015 Jeff Preshing
	//
	// This software is provided 'as-is', without any express or implied
	// warranty. In no event will the authors be held liable for any damages
	// arising from the use of this software.
	//
	// Permission is granted to anyone to use this software for any purpose,
	// including commercial applications, and to alter it and redistribute it
	// freely, subject to the following restrictions:
	//
	// 1. The origin of this software must not be misrepresented; you must not
	//    claim that you wrote the original software. If you use this software
	//    in a product, an acknowledgement in the product documentation would be
	//    appreciated but is not required.
	// 2. Altered source versions must be plainly marked as such, and must not be
	//    misrepresented as being the original software.
	// 3. This notice may not be removed or altered from any source distribution.
	namespace spsc_sema
	{
#if defined(_WIN32)
		class Semaphore
		{
		private:
			void* m_hSema;

			Semaphore(const Semaphore& other);
			Semaphore& operator=(const Semaphore& other);

		public:
			Semaphore(int initialCount = 0)
			{
				assert(initialCount >= 0);
				const long maxLong = 0x7fffffff;
				m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
			}

			~Semaphore()
			{
				CloseHandle(m_hSema);
			}

			void wait()
			{
				const unsigned long infinite = 0xffffffff;
				WaitForSingleObject(m_hSema, infinite);
			}

			void signal(int count = 1)
			{
				ReleaseSemaphore(m_hSema, count, nullptr);
			}
		};
#elif defined(__MACH__)
		//---------------------------------------------------------
		// Semaphore (Apple iOS and OSX)
		// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
		//---------------------------------------------------------
		class Semaphore
		{
		private:
			semaphore_t m_sema;

			Semaphore(const Semaphore& other);
			Semaphore& operator=(const Semaphore& other);

		public:
			Semaphore(int initialCount = 0)
			{
				assert(initialCount >= 0);
				semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
			}

			~Semaphore()
			{
				semaphore_destroy(mach_task_self(), m_sema);
			}

			void wait()
			{
				semaphore_wait(m_sema);
			}

			void signal()
			{
				semaphore_signal(m_sema);
			}

			void signal(int count)
			{
				while (count-- > 0)
				{
					semaphore_signal(m_sema);
				}
			}
		};
#elif defined(__unix__)
		//---------------------------------------------------------
		// Semaphore (POSIX, Linux)
		//---------------------------------------------------------
		class Semaphore
		{
		private:
			sem_t m_sema;

			Semaphore(const Semaphore& other);
			Semaphore& operator=(const Semaphore& other);

		public:
			Semaphore(int initialCount = 0)
			{
				assert(initialCount >= 0);
				sem_init(&m_sema, 0, initialCount);
			}

			~Semaphore()
			{
				sem_destroy(&m_sema);
			}

			void wait()
			{
				// http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
				int rc;
				do
				{
					rc = sem_wait(&m_sema);
				}
				while (rc == -1 && errno == EINTR);
			}

			void signal()
			{
				sem_post(&m_sema);
			}

			void signal(int count)
			{
				while (count-- > 0)
				{
					sem_post(&m_sema);
				}
			}
		};
#else
#error Unsupported platform! (No semaphore wrapper available)
#endif

		//---------------------------------------------------------
		// LightweightSemaphore
		//---------------------------------------------------------
		class LightweightSemaphore
		{
		public:
			typedef std::make_signed<std::size_t>::type ssize_t;

		private:
			weak_atomic<ssize_t> m_count;
			Semaphore m_sema;

			void waitWithPartialSpinning()
			{
				ssize_t oldCount;
				// Is there a better way to set the initial spin count?
				// If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC,
				// as threads start hitting the kernel semaphore.
				int spin = 10000;
				while (--spin >= 0)
				{
					if (m_count.load() > 0)
					{
						m_count.fetch_add_acquire(-1);
						return;
					}
					compiler_fence(memory_order_acquire);	// Prevent the compiler from collapsing the loop.
				}
				oldCount = m_count.fetch_add_acquire(-1);
				if (oldCount <= 0)
				{
					m_sema.wait();
				}
			}

		public:
			LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount)
			{
				assert(initialCount >= 0);
			}

			bool tryWait()
			{
				if (m_count.load() > 0)
				{
					m_count.fetch_add_acquire(-1);
					return true;
				}
				return false;
			}

			void wait()
			{
				if (!tryWait())
					waitWithPartialSpinning();
			}

			void signal(ssize_t count = 1)
			{
				assert(count >= 0);
				ssize_t oldCount = m_count.fetch_add_release(count);
				assert(oldCount >= -1);
				if (oldCount < 0)
				{
					m_sema.signal(1);
				}
			}

			ssize_t availableApprox() const
			{
				ssize_t count = m_count.load();
				return count > 0 ? count : 0;
			}
		};
	}	// end namespace spsc_sema
}	// end namespace moodycamel
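
// Illustrative usage sketch (hypothetical single-producer/single-consumer setup):
//
//     moodycamel::spsc_sema::LightweightSemaphore itemsReady;
//
//     // Producer thread, after enqueueing one item:
//     itemsReady.signal();
//
//     // Consumer thread, before dequeueing one item:
//     itemsReady.wait();    // spins briefly, then falls back to the kernel semaphore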

#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))
#pragma warning(pop)
#ifdef __cplusplus_cli
#pragma managed(pop)
#endif
#endif