OSDN Git Service

Use partial TLAB regions
author: Mathieu Chartier <mathieuc@google.com>
Wed, 19 Apr 2017 00:46:23 +0000 (17:46 -0700)
committer: Mathieu Chartier <mathieuc@google.com>
Thu, 20 Apr 2017 04:26:56 +0000 (21:26 -0700)
Instead of having 256K TLAB regions, have 256K TLABs split into
16K regions. This fixes pathological cases with multithreaded
allocation that caused many GCs since each thread reserving
256K would often bump the counter past the GC start threshold. Now
threads only bump the counter every 16K.

System wide results (average of 5 samples on N6P):
Total GC time 60s after starting shell: 45s -> 24s
Average .Heap PSS 60s after starting shell: 57900k -> 58682k

BinaryTrees gets around 5% slower, though the numbers are noisy.

Boot time: 13.302 -> 12.899 (average of 100 runs)

Bug: 35872915
Bug: 36216292

Test: test-art-host

(cherry picked from commit bf48003fa32d2845f2213c0ba31af6677715662d)

Change-Id: I5ab22420124eeadc0a53519c70112274101dfb39

12 files changed:
runtime/asm_support.h
runtime/gc/allocator_type.h
runtime/gc/heap-inl.h
runtime/gc/heap.cc
runtime/gc/space/bump_pointer_space.cc
runtime/gc/space/region_space-inl.h
runtime/gc/space/region_space.cc
runtime/gc/space/region_space.h
runtime/oat.h
runtime/thread-inl.h
runtime/thread.cc
runtime/thread.h

index 4a2e34f..6d271ed 100644 (file)
@@ -98,7 +98,7 @@ ADD_TEST_EQ(THREAD_LOCAL_POS_OFFSET,
 ADD_TEST_EQ(THREAD_LOCAL_END_OFFSET,
             art::Thread::ThreadLocalEndOffset<POINTER_SIZE>().Int32Value())
 // Offset of field Thread::tlsPtr_.thread_local_objects.
-#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_END_OFFSET + __SIZEOF_POINTER__)
+#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_END_OFFSET + 2 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
             art::Thread::ThreadLocalObjectsOffset<POINTER_SIZE>().Int32Value())
 
index 185a9b7..2f1f577 100644 (file)
@@ -35,6 +35,10 @@ enum AllocatorType {
 };
 std::ostream& operator<<(std::ostream& os, const AllocatorType& rhs);
 
+inline constexpr bool IsTLABAllocator(AllocatorType allocator) {
+  return allocator == kAllocatorTypeTLAB || allocator == kAllocatorTypeRegionTLAB;
+}
+
 }  // namespace gc
 }  // namespace art
 
index 394e541..a50d125 100644 (file)
@@ -77,12 +77,11 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
   size_t bytes_allocated;
   size_t usable_size;
   size_t new_num_bytes_allocated = 0;
-  if (allocator == kAllocatorTypeTLAB || allocator == kAllocatorTypeRegionTLAB) {
+  if (IsTLABAllocator(allocator)) {
     byte_count = RoundUp(byte_count, space::BumpPointerSpace::kAlignment);
   }
   // If we have a thread local allocation we don't need to update bytes allocated.
-  if ((allocator == kAllocatorTypeTLAB || allocator == kAllocatorTypeRegionTLAB) &&
-      byte_count <= self->TlabSize()) {
+  if (IsTLABAllocator(allocator) && byte_count <= self->TlabSize()) {
     obj = self->AllocTlab(byte_count);
     DCHECK(obj != nullptr) << "AllocTlab can't fail";
     obj->SetClass(klass);
index 4a25610..28dd627 100644 (file)
@@ -135,6 +135,13 @@ static constexpr double kExtraHeapGrowthMultiplier = kUseReadBarrier ? 1.0 : 0.0
 
 static const char* kRegionSpaceName = "main space (region space)";
 
+// If true, we log all GCs in both the foreground and background. Used for debugging.
+static constexpr bool kLogAllGCs = false;
+
+// How much we grow the TLAB if we can do it.
+static constexpr size_t kPartialTlabSize = 16 * KB;
+static constexpr bool kUsePartialTlabs = true;
+
 #if defined(__LP64__) || !defined(ADDRESS_SANITIZER)
 // 300 MB (0x12c00000) - (default non-moving space capacity).
 static uint8_t* const kPreferredAllocSpaceBegin =
@@ -2762,7 +2769,7 @@ void Heap::LogGC(GcCause gc_cause, collector::GarbageCollector* collector) {
   const std::vector<uint64_t>& pause_times = GetCurrentGcIteration()->GetPauseTimes();
   // Print the GC if it is an explicit GC (e.g. Runtime.gc()) or a slow GC
   // (mutator time blocked >= long_pause_log_threshold_).
-  bool log_gc = gc_cause == kGcCauseExplicit;
+  bool log_gc = kLogAllGCs || gc_cause == kGcCauseExplicit;
   if (!log_gc && CareAboutPauseTimes()) {
     // GC for alloc pauses the allocating thread, so consider it as a pause.
     log_gc = duration > long_gc_log_threshold_ ||
@@ -4182,7 +4189,21 @@ mirror::Object* Heap::AllocWithNewTLAB(Thread* self,
                                        size_t* usable_size,
                                        size_t* bytes_tl_bulk_allocated) {
   const AllocatorType allocator_type = GetCurrentAllocator();
-  if (allocator_type == kAllocatorTypeTLAB) {
+  if (kUsePartialTlabs && alloc_size <= self->TlabRemainingCapacity()) {
+    DCHECK_GT(alloc_size, self->TlabSize());
+    // There is enough space if we grow the TLAB. Let's do that. This increases the
+    // TLAB bytes.
+    const size_t min_expand_size = alloc_size - self->TlabSize();
+    const size_t expand_bytes = std::max(
+        min_expand_size,
+        std::min(self->TlabRemainingCapacity() - self->TlabSize(), kPartialTlabSize));
+    if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, expand_bytes, grow))) {
+      return nullptr;
+    }
+    *bytes_tl_bulk_allocated = expand_bytes;
+    self->ExpandTlab(expand_bytes);
+    DCHECK_LE(alloc_size, self->TlabSize());
+  } else if (allocator_type == kAllocatorTypeTLAB) {
     DCHECK(bump_pointer_space_ != nullptr);
     const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
     if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) {
@@ -4202,15 +4223,18 @@ mirror::Object* Heap::AllocWithNewTLAB(Thread* self,
       if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type,
                                             space::RegionSpace::kRegionSize,
                                             grow))) {
+        const size_t new_tlab_size = kUsePartialTlabs
+            ? std::max(alloc_size, kPartialTlabSize)
+            : gc::space::RegionSpace::kRegionSize;
         // Try to allocate a tlab.
-        if (!region_space_->AllocNewTlab(self)) {
+        if (!region_space_->AllocNewTlab(self, new_tlab_size)) {
           // Failed to allocate a tlab. Try non-tlab.
           return region_space_->AllocNonvirtual<false>(alloc_size,
                                                        bytes_allocated,
                                                        usable_size,
                                                        bytes_tl_bulk_allocated);
         }
-        *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
+        *bytes_tl_bulk_allocated = new_tlab_size;
         // Fall-through to using the TLAB below.
       } else {
         // Check OOME for a non-tlab allocation.
index 1303d77..426b332 100644 (file)
@@ -249,7 +249,7 @@ uint64_t BumpPointerSpace::GetObjectsAllocated() {
 void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
   objects_allocated_.FetchAndAddSequentiallyConsistent(thread->GetThreadLocalObjectsAllocated());
   bytes_allocated_.FetchAndAddSequentiallyConsistent(thread->GetThreadLocalBytesAllocated());
-  thread->SetTlab(nullptr, nullptr);
+  thread->SetTlab(nullptr, nullptr, nullptr);
 }
 
 bool BumpPointerSpace::AllocNewTlab(Thread* self, size_t bytes) {
@@ -259,7 +259,7 @@ bool BumpPointerSpace::AllocNewTlab(Thread* self, size_t bytes) {
   if (start == nullptr) {
     return false;
   }
-  self->SetTlab(start, start + bytes);
+  self->SetTlab(start, start + bytes, start + bytes);
   return true;
 }
 
index 5809027..3910a03 100644 (file)
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_GC_SPACE_REGION_SPACE_INL_H_
 
 #include "region_space.h"
+#include "thread-inl.h"
 
 namespace art {
 namespace gc {
@@ -335,6 +336,28 @@ mirror::Object* RegionSpace::AllocLarge(size_t num_bytes, size_t* bytes_allocate
   return nullptr;
 }
 
+inline size_t RegionSpace::Region::BytesAllocated() const {
+  if (IsLarge()) {
+    DCHECK_LT(begin_ + kRegionSize, Top());
+    return static_cast<size_t>(Top() - begin_);
+  } else if (IsLargeTail()) {
+    DCHECK_EQ(begin_, Top());
+    return 0;
+  } else {
+    DCHECK(IsAllocated()) << static_cast<uint>(state_);
+    DCHECK_LE(begin_, Top());
+    size_t bytes;
+    if (is_a_tlab_) {
+      bytes = thread_->GetThreadLocalBytesAllocated();
+    } else {
+      bytes = static_cast<size_t>(Top() - begin_);
+    }
+    DCHECK_LE(bytes, kRegionSize);
+    return bytes;
+  }
+}
+
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art
index 1ad4843..09b4a3a 100644 (file)
@@ -427,7 +427,7 @@ void RegionSpace::RecordAlloc(mirror::Object* ref) {
   r->objects_allocated_.FetchAndAddSequentiallyConsistent(1);
 }
 
-bool RegionSpace::AllocNewTlab(Thread* self) {
+bool RegionSpace::AllocNewTlab(Thread* self, size_t min_bytes) {
   MutexLock mu(self, region_lock_);
   RevokeThreadLocalBuffersLocked(self);
   // Retain sufficient free regions for full evacuation.
@@ -443,7 +443,7 @@ bool RegionSpace::AllocNewTlab(Thread* self) {
       r->SetTop(r->End());
       r->is_a_tlab_ = true;
       r->thread_ = self;
-      self->SetTlab(r->Begin(), r->End());
+      self->SetTlab(r->Begin(), r->Begin() + min_bytes, r->End());
       return true;
     }
   }
@@ -463,13 +463,13 @@ void RegionSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
     DCHECK_ALIGNED(tlab_start, kRegionSize);
     Region* r = RefToRegionLocked(reinterpret_cast<mirror::Object*>(tlab_start));
     DCHECK(r->IsAllocated());
-    DCHECK_EQ(thread->GetThreadLocalBytesAllocated(), kRegionSize);
+    DCHECK_LE(thread->GetThreadLocalBytesAllocated(), kRegionSize);
     r->RecordThreadLocalAllocations(thread->GetThreadLocalObjectsAllocated(),
                                     thread->GetThreadLocalBytesAllocated());
     r->is_a_tlab_ = false;
     r->thread_ = nullptr;
   }
-  thread->SetTlab(nullptr, nullptr);
+  thread->SetTlab(nullptr, nullptr, nullptr);
 }
 
 size_t RegionSpace::RevokeAllThreadLocalBuffers() {
index 2537929..80eecca 100644 (file)
@@ -234,7 +234,7 @@ class RegionSpace FINAL : public ContinuousMemMapAllocSpace {
   }
 
   void RecordAlloc(mirror::Object* ref) REQUIRES(!region_lock_);
-  bool AllocNewTlab(Thread* self) REQUIRES(!region_lock_);
+  bool AllocNewTlab(Thread* self, size_t min_bytes) REQUIRES(!region_lock_);
 
   uint32_t Time() {
     return time_;
@@ -417,21 +417,7 @@ class RegionSpace FINAL : public ContinuousMemMapAllocSpace {
       return live_bytes_;
     }
 
-    size_t BytesAllocated() const {
-      if (IsLarge()) {
-        DCHECK_LT(begin_ + kRegionSize, Top());
-        return static_cast<size_t>(Top() - begin_);
-      } else if (IsLargeTail()) {
-        DCHECK_EQ(begin_, Top());
-        return 0;
-      } else {
-        DCHECK(IsAllocated()) << static_cast<uint>(state_);
-        DCHECK_LE(begin_, Top());
-        size_t bytes = static_cast<size_t>(Top() - begin_);
-        DCHECK_LE(bytes, kRegionSize);
-        return bytes;
-      }
-    }
+    size_t BytesAllocated() const;
 
     size_t ObjectsAllocated() const {
       if (IsLarge()) {
@@ -476,7 +462,7 @@ class RegionSpace FINAL : public ContinuousMemMapAllocSpace {
       DCHECK_EQ(Top(), end_);
       objects_allocated_.StoreRelaxed(num_objects);
       top_.StoreRelaxed(begin_ + num_bytes);
-      DCHECK_EQ(Top(), end_);
+      DCHECK_LE(Top(), end_);
     }
 
    private:
index 58ea91b..0570625 100644 (file)
@@ -32,7 +32,7 @@ class InstructionSetFeatures;
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '1', '1', '8', '\0' };  // ARM64 Read barriers thunks.
+  static constexpr uint8_t kOatVersion[] = { '1', '1', '9', '\0' };  // Add thread_local_limit.
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
index 02a1e4d..aa769fa 100644 (file)
@@ -303,10 +303,6 @@ inline void Thread::VerifyStack() {
   }
 }
 
-inline size_t Thread::TlabSize() const {
-  return tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos;
-}
-
 inline mirror::Object* Thread::AllocTlab(size_t bytes) {
   DCHECK_GE(TlabSize(), bytes);
   ++tlsPtr_.thread_local_objects;
index a8a03c7..f887aaa 100644 (file)
@@ -3468,11 +3468,13 @@ void Thread::SetStackEndForStackOverflow() {
   }
 }
 
-void Thread::SetTlab(uint8_t* start, uint8_t* end) {
+void Thread::SetTlab(uint8_t* start, uint8_t* end, uint8_t* limit) {
   DCHECK_LE(start, end);
+  DCHECK_LE(end, limit);
   tlsPtr_.thread_local_start = start;
   tlsPtr_.thread_local_pos  = tlsPtr_.thread_local_start;
   tlsPtr_.thread_local_end = end;
+  tlsPtr_.thread_local_limit = limit;
   tlsPtr_.thread_local_objects = 0;
 }
 
index 4d5d644..5251012 100644 (file)
@@ -1035,10 +1035,24 @@ class Thread {
   void ResetQuickAllocEntryPointsForThread(bool is_marking);
 
   // Returns the remaining space in the TLAB.
-  size_t TlabSize() const;
+  size_t TlabSize() const {
+    return tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos;
+  }
+
+  // Returns the remaining space in the TLAB if we were to expand it to maximum capacity.
+  size_t TlabRemainingCapacity() const {
+    return tlsPtr_.thread_local_limit - tlsPtr_.thread_local_pos;
+  }
+
+  // Expand the TLAB by a fixed number of bytes. There must be enough capacity to do so.
+  void ExpandTlab(size_t bytes) {
+    tlsPtr_.thread_local_end += bytes;
+    DCHECK_LE(tlsPtr_.thread_local_end, tlsPtr_.thread_local_limit);
+  }
+
   // Doesn't check that there is room.
   mirror::Object* AllocTlab(size_t bytes);
-  void SetTlab(uint8_t* start, uint8_t* end);
+  void SetTlab(uint8_t* start, uint8_t* end, uint8_t* limit);
   bool HasTlab() const;
   uint8_t* GetTlabStart() {
     return tlsPtr_.thread_local_start;
@@ -1451,6 +1465,7 @@ class Thread {
       frame_id_to_shadow_frame(nullptr), name(nullptr), pthread_self(0),
       last_no_thread_suspension_cause(nullptr), checkpoint_function(nullptr),
       thread_local_start(nullptr), thread_local_pos(nullptr), thread_local_end(nullptr),
+      thread_local_limit(nullptr),
       thread_local_objects(0), mterp_current_ibase(nullptr), mterp_default_ibase(nullptr),
       mterp_alt_ibase(nullptr), thread_local_alloc_stack_top(nullptr),
       thread_local_alloc_stack_end(nullptr),
@@ -1577,6 +1592,10 @@ class Thread {
     uint8_t* thread_local_pos;
     uint8_t* thread_local_end;
 
+    // Thread local limit is how far we can expand the thread local buffer to; it is greater than
+    // or equal to thread_local_end.
+    uint8_t* thread_local_limit;
+
     size_t thread_local_objects;
 
     // Entrypoint function pointers.