From fc56d14268e6f9b4ce68780bf24aa3423749f6f7 Mon Sep 17 00:00:00 2001 From: Rainer Schuetze Date: Sun, 21 Apr 2019 09:35:33 +0200 Subject: [PATCH 1/7] createLowLevelThread: add an optional DLL unload callback that is called if the DLL is supposed to be unloaded --- src/core/sys/windows/dll.d | 95 ++++++++++++++++-- src/core/thread.d | 176 ++++++++++++++++++++++++++++++++-- src/rt/msvc.c | 3 + test/shared/src/dllgc.d | 63 ++++++++++++ test/shared/src/dllrefcount.d | 20 ++++ test/shared/win64.mak | 23 +++++ win32.mak | 13 ++- win64.mak | 6 +- 8 files changed, 375 insertions(+), 24 deletions(-) create mode 100644 test/shared/src/dllgc.d create mode 100644 test/shared/src/dllrefcount.d create mode 100644 test/shared/win64.mak diff --git a/src/core/sys/windows/dll.d b/src/core/sys/windows/dll.d index 5c7024f6c1..915e444114 100644 --- a/src/core/sys/windows/dll.d +++ b/src/core/sys/windows/dll.d @@ -55,11 +55,11 @@ extern (C) // rt.minfo } private: -version (Win32) -{ struct dll_aux { // don't let symbols leak into other modules +version (Win32) +{ struct LdrpTlsListEntry { LdrpTlsListEntry* next; @@ -225,6 +225,7 @@ struct dll_aux // let the old array leak, in case a oncurrent thread is still relying on it return true; } +} // Win32 alias bool BOOLEAN; @@ -241,7 +242,8 @@ struct dll_aux LIST_ENTRY* prev; } - // the following structures can be found here: http://undocumented.ntinternals.net/ + // the following structures can be found here: + // https://www.geoffchappell.com/studies/windows/win32/ntdll/structs/ldr_data_table_entry.htm // perhaps this should be same as LDR_DATA_TABLE_ENTRY, which is introduced with PEB_LDR_DATA struct LDR_MODULE { @@ -254,10 +256,22 @@ struct dll_aux UNICODE_STRING FullDllName; UNICODE_STRING BaseDllName; ULONG Flags; - SHORT LoadCount; + SHORT LoadCount; // obsolete after Version 6.1 SHORT TlsIndex; LIST_ENTRY HashTableEntry; ULONG TimeDateStamp; + PVOID EntryPointActivationContext; + PVOID PatchInformation; + LDR_DDAG_NODE *DdagNode; // 
starting with Version 6.2 + } + + struct LDR_DDAG_NODE + { + LIST_ENTRY Modules; + void* ServiceTagList; // LDR_SERVICE_TAG_RECORD + ULONG LoadCount; + ULONG ReferenceCount; // Version 10: ULONG LoadWhileUnloadingCount; + ULONG DependencyCount; // Version 10: ULONG LowestLink; } struct PEB_LDR_DATA @@ -270,7 +284,7 @@ struct dll_aux LIST_ENTRY InInitializationOrderModuleList; } - static LDR_MODULE* findLdrModule( HINSTANCE hInstance, void** peb ) nothrow + static LDR_MODULE* findLdrModule( HINSTANCE hInstance, void** peb ) nothrow @nogc { PEB_LDR_DATA* ldrData = cast(PEB_LDR_DATA*) peb[3]; LIST_ENTRY* root = &ldrData.InLoadOrderModuleList; @@ -294,7 +308,6 @@ struct dll_aux return true; } } -} public: /* ***************************************************** @@ -359,6 +372,63 @@ bool dll_fixTLS( HINSTANCE hInstance, void* tlsstart, void* tlsend, void* tls_ca } } +private extern (Windows) ULONGLONG VerSetConditionMask(ULONGLONG, DWORD, BYTE) nothrow @nogc; + +private bool isWindows8OrLater() nothrow @nogc +{ + OSVERSIONINFOEXW osvi; + osvi.dwOSVersionInfoSize = osvi.sizeof; + DWORDLONG dwlConditionMask = VerSetConditionMask( + VerSetConditionMask( + VerSetConditionMask( + 0, VER_MAJORVERSION, VER_GREATER_EQUAL), + VER_MINORVERSION, VER_GREATER_EQUAL), + VER_SERVICEPACKMAJOR, VER_GREATER_EQUAL); + + osvi.dwMajorVersion = 6; + osvi.dwMinorVersion = 2; + osvi.wServicePackMajor = 0; + + return VerifyVersionInfoW(&osvi, VER_MAJORVERSION | VER_MINORVERSION | VER_SERVICEPACKMAJOR, dwlConditionMask) != FALSE; +} + +/* ***************************************************** + * Get the process reference count for the given DLL handle + * Params: + * hInstance = DLL instance handle + * Returns: + * the reference count for the DLL in the current process, + * -1 if the DLL is implicitly loaded with the process + * or -2 if the DLL handle is invalid + */ +int dll_getRefCount( HINSTANCE hInstance ) nothrow @nogc +{ + void** peb; + version (Win64) + { + asm pure nothrow @nogc + { + 
mov RAX, 0x60; + mov RAX,GS:[RAX]; + mov peb, RAX; + } + } + else version (Win32) + { + asm pure nothrow @nogc + { + mov EAX,FS:[0x30]; + mov peb, EAX; + } + } + dll_aux.LDR_MODULE *ldrMod = dll_aux.findLdrModule( hInstance, peb ); + if ( !ldrMod ) + return -2; // not in module list, bail out + if (isWindows8OrLater()) + return ldrMod.DdagNode.LoadCount; + return ldrMod.LoadCount; +} + // fixup TLS storage, initialize runtime and attach to threads // to be called from DllMain with reason DLL_PROCESS_ATTACH bool dll_process_attach( HINSTANCE hInstance, bool attach_threads, @@ -415,11 +485,16 @@ void dll_process_detach( HINSTANCE hInstance, bool detach_threads = true ) // detach from all other threads if ( detach_threads ) enumProcessThreads( - function (uint id, void* context) { - if ( id != GetCurrentThreadId() && thread_findByAddr( id ) ) + function (uint id, void* context) + { + if ( id != GetCurrentThreadId() ) { - thread_moduleTlsDtor( id ); - thread_detachByAddr( id ); + if ( auto t = thread_findByAddr( id ) ) + { + thread_moduleTlsDtor( id ); + if ( !t.isMainThread() ) + thread_detachByAddr( id ); + } } return true; }, null ); diff --git a/src/core/thread.d b/src/core/thread.d index 655bdb35ea..d012d6b8e5 100644 --- a/src/core/thread.d +++ b/src/core/thread.d @@ -882,6 +882,17 @@ class Thread } } + /** + * Tests whether this thread is the main thread, i.e. the thread + * that initialized the runtime + * + * Returns: + * true if the thread is the main thread + */ + final @property bool isMainThread() nothrow @nogc + { + return this is sm_main; + } /** * Tests whether this thread is running. 
@@ -5636,8 +5647,15 @@ version (Posix) // lowlovel threading support private { + struct ll_ThreadData + { + ThreadID tid; + version (Windows) + void delegate() nothrow cbDllUnload; + } + __gshared size_t ll_nThreads; - __gshared ThreadID* ll_pThreads; + __gshared ll_ThreadData* ll_pThreads; __gshared align(Mutex.alignof) void[__traits(classInstanceSize, Mutex)] ll_lock; @@ -5664,21 +5682,151 @@ private foreach (i; 0 .. ll_nThreads) { - if (tid is ll_pThreads[i]) + if (tid is ll_pThreads[i].tid) { import core.stdc.string : memmove; - memmove(ll_pThreads + i, ll_pThreads + i + 1, ThreadID.sizeof * (ll_nThreads - i - 1)); + memmove(ll_pThreads + i, ll_pThreads + i + 1, ll_ThreadData.sizeof * (ll_nThreads - i - 1)); --ll_nThreads; // no need to minimize, next add will do break; } } } -} -version (Windows) + version (Windows): + // If the runtime is dynamically loaded as a DLL, there is a problem with + // threads still running when the DLL is supposed to be unloaded: + // + // - with the VC runtime starting with VS2015 (i.e. using the Universal CRT) + // a thread created with _beginthreadex increments the DLL reference count + // and decrements it when done, so that the DLL is no longer unloaded unless + // all the threads have terminated. With the DLL reference count held up + // by a thread that is only stopped by a signal from a static destructor or + // the termination of the runtime will cause the DLL to never be unloaded. + // + // - with the DigitalMars runtime and VC runtime up to VS2013, the thread + // continues to run, but crashes once the DLL is unloaded from memory as + // the code memory is no longer accessible. Stopping the threads is not possible + // from within the runtime termination as it is invoked from + // DllMain(DLL_PROCESS_DETACH) holding a lock that prevents threads from + // terminating. + // + // Solution: start a watchdog thread that keeps the DLL reference count above 0 and + // checks it periodically. 
If it is equal to 1 (plus the number of started threads), no + // external references to the DLL exist anymore, threads can be stopped + // and runtime termination and DLL unload can be invoked via FreeLibraryAndExitThread. + // Note: runtime termination is then performed by a different thread than at startup. + // + // Note: if the DLL is never unloaded, process termination kills all threads + // and signals their handles before unconditionally calling DllMain(DLL_PROCESS_DETACH). + + import core.sys.windows.windows : HMODULE, FreeLibraryAndExitThread, GetModuleHandleExW, + GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT; + import core.sys.windows.dll : dll_getRefCount; + + version(CRuntime_Microsoft) + extern(C) extern __gshared ubyte msvcUsesUCRT; // from rt/msvc.c + package(core) bool thread_DLLProcessDetaching; + __gshared HMODULE ll_dllModule; + __gshared ThreadID ll_dllMonitorThread; + + int ll_countLowLevelThreadsWithDLLUnloadCallback() nothrow + { + lowlevelLock.lock_nothrow(); + scope(exit) lowlevelLock.unlock_nothrow(); + + int cnt = 0; + foreach (i; 0 .. ll_nThreads) + if (ll_pThreads[i].cbDllUnload) + cnt++; + return cnt; + } + + bool ll_dllHasExternalReferences() nothrow + { + version (CRuntime_DigitalMars) + enum internalReferences = 1; // only the watchdog thread + else + int internalReferences = msvcUsesUCRT ? 1 + ll_countLowLevelThreadsWithDLLUnloadCallback() : 1; + + int refcnt = dll_getRefCount(ll_dllModule); + return refcnt > internalReferences; + } + + private void monitorDLLRefCnt() nothrow + { + // this thread keeps the DLL alive until all external references are gone + while (ll_dllHasExternalReferences()) + { + Thread.sleep(100.msecs); + } + + // the current thread will be terminated below + ll_removeThread(GetCurrentThreadId()); + + for (;;) + { + ThreadID tid; + void delegate() nothrow cbDllUnload; + { + lowlevelLock.lock_nothrow(); + scope(exit) lowlevelLock.unlock_nothrow(); + + foreach (i; 0 .. 
ll_nThreads) + if (ll_pThreads[i].cbDllUnload) + { + cbDllUnload = ll_pThreads[i].cbDllUnload; + tid = ll_pThreads[i].tid; // id of the same entry whose callback is invoked, so the assert below checks the right thread + } + } + if (!cbDllUnload) + break; + cbDllUnload(); + assert(!findLowLevelThread(tid)); + } + + FreeLibraryAndExitThread(ll_dllModule, 0); + } + + int ll_getDLLRefCount() nothrow @nogc + { + if (!ll_dllModule && + !GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + cast(const(wchar)*) &ll_getDLLRefCount, &ll_dllModule)) + return -1; + return dll_getRefCount(ll_dllModule); + } + + bool ll_startDLLUnloadThread() nothrow @nogc + { + int refcnt = ll_getDLLRefCount(); + if (refcnt < 0) + return false; // not a dynamically loaded DLL + + if (ll_dllMonitorThread !is ThreadID.init) + return true; + + // if a thread is created from a DLL, the MS runtime (starting with VC2015) increments the DLL reference count + // to avoid the DLL being unloaded while the thread is still running. Mimic this behavior here for all + // runtimes not doing this + version (CRuntime_DigitalMars) + enum needRef = true; + else + bool needRef = !msvcUsesUCRT; + + if (needRef) + { + HMODULE hmod; + GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, cast(const(wchar)*) &ll_getDLLRefCount, &hmod); + } + + ll_dllMonitorThread = createLowLevelThread(() { monitorDLLRefCnt(); }); + return ll_dllMonitorThread != ThreadID.init; + } +} + /** * Create a thread not under control of the runtime, i.e. TLS module constructors are * not run and the GC does not suspend it during a collection @@ -5687,11 +5835,15 @@ version (Windows) * dg = delegate to execute in the created thread * stacksize = size of the stack of the created thread. The default of 0 will select the * platform-specific default size + * cbDllUnload = Windows only: if running in a dynamically loaded DLL, this delegate will be called + * if the DLL is supposed to be unloaded, but the thread is still running. 
+ * The thread must be terminated via `joinLowLevelThread` by the callback. * * Returns: the platform specific thread ID of the new thread. If an error occurs, `ThreadID.init` * is returned. */ -ThreadID createLowLevelThread(void delegate() nothrow dg, uint stacksize = 0) nothrow @nogc +ThreadID createLowLevelThread(void delegate() nothrow dg, uint stacksize = 0, + void delegate() nothrow cbDllUnload = null) nothrow @nogc { void delegate() nothrow* context = cast(void delegate() nothrow*)malloc(dg.sizeof); *context = dg; @@ -5720,14 +5872,18 @@ ThreadID createLowLevelThread(void delegate() nothrow dg, uint stacksize = 0) no scope(exit) lowlevelLock.unlock_nothrow(); ll_nThreads++; - ll_pThreads = cast(ThreadID*)realloc(ll_pThreads, Thread.sizeof * ll_nThreads); + ll_pThreads = cast(ll_ThreadData*)realloc(ll_pThreads, ll_ThreadData.sizeof * ll_nThreads); version (Windows) { - ll_pThreads[ll_nThreads - 1] = tid; + ll_pThreads[ll_nThreads - 1].tid = tid; + ll_pThreads[ll_nThreads - 1].cbDllUnload = cbDllUnload; if (ResumeThread(hThread) == -1) onThreadError("Error resuming thread"); CloseHandle(hThread); + + if (cbDllUnload) + ll_startDLLUnloadThread(); } else version (Posix) { @@ -5751,7 +5907,7 @@ ThreadID createLowLevelThread(void delegate() nothrow dg, uint stacksize = 0) no if ((rc = pthread_create(&tid, &attr, &thread_lowlevelEntry, context)) != 0) return ThreadID.init; - ll_pThreads[ll_nThreads - 1] = tid; + ll_pThreads[ll_nThreads - 1].tid = tid; } return tid; } @@ -5808,7 +5964,7 @@ bool findLowLevelThread(ThreadID tid) nothrow @nogc scope(exit) lowlevelLock.unlock_nothrow(); foreach (i; 0 .. 
ll_nThreads) - if (tid is ll_pThreads[i]) + if (tid is ll_pThreads[i].tid) return true; return false; } diff --git a/src/rt/msvc.c b/src/rt/msvc.c index 06d77e19d5..3d77e7d35c 100644 --- a/src/rt/msvc.c +++ b/src/rt/msvc.c @@ -36,6 +36,8 @@ int _set_output_format(int format); // VS2013- //extern const char* __acrt_iob_func; extern const char* _nullfunc = 0; +unsigned char msvcUsesUCRT; + #if defined _M_IX86 #define C_PREFIX "_" #elif defined _M_X64 || defined _M_ARM || defined _M_ARM64 @@ -58,6 +60,7 @@ void init_msvc() stdin = __acrt_iob_func(0); stdout = __acrt_iob_func(1); stderr = __acrt_iob_func(2); + msvcUsesUCRT = 1; } else if (&__iob_func != (void*) &_nullfunc) { diff --git a/test/shared/src/dllgc.d b/test/shared/src/dllgc.d new file mode 100644 index 0000000000..ff46b094ea --- /dev/null +++ b/test/shared/src/dllgc.d @@ -0,0 +1,63 @@ +version(DLL) +{ + import core.sys.windows.dll; + import core.memory; + import core.thread; + import core.sync.event; + + mixin SimpleDllMain; + + class Task + { + bool stop; + Event event; + ThreadID tid; + + this() + { + event.initialize(true, false); + tid = createLowLevelThread(&run, 0, &term); + } + + void run() nothrow + { + while (!stop) + { + event.wait(100.msecs); + } + } + void term() nothrow + { + stop = true; + event.set(); + joinLowLevelThread(tid); + } + } + + static this() + { + auto tsk = new Task; + } +} +else +{ + void main() + { + import core.runtime; + import core.time; + import core.thread; + import core.sys.windows.windows : GetModuleHandleA; + + auto dll = Runtime.loadLibrary("dllgc.dll"); + assert(dll); + Runtime.unloadLibrary(dll); + // the DLL might not be unloaded immediately, but should do so eventually + for (int i = 0; i < 100; i++) + { + if (!GetModuleHandleA("dllgc.dll")) + return; + Thread.sleep(10.msecs); + } + assert(false); + } +} diff --git a/test/shared/src/dllrefcount.d b/test/shared/src/dllrefcount.d new file mode 100644 index 0000000000..dfcc45a471 --- /dev/null +++ 
b/test/shared/src/dllrefcount.d @@ -0,0 +1,20 @@ + +import core.sys.windows.dll; +import core.runtime; + +void main() +{ + auto kernel32 = Runtime.loadLibrary("kernel32.dll"); + assert(kernel32); + int refcnt = dll_getRefCount(kernel32); + assert(refcnt == -1); + + auto imagehlp = Runtime.loadLibrary("imagehlp.dll"); + assert(imagehlp); + refcnt = dll_getRefCount(imagehlp); + assert(refcnt == 1); + + Runtime.unloadLibrary(imagehlp); + refcnt = dll_getRefCount(imagehlp); + assert(refcnt == -2); +} diff --git a/test/shared/win64.mak b/test/shared/win64.mak new file mode 100644 index 0000000000..8535d93d32 --- /dev/null +++ b/test/shared/win64.mak @@ -0,0 +1,23 @@ +# built from the druntime top-level folder +# to be overwritten by caller +DMD=dmd +MODEL=64 +DRUNTIMELIB=druntime64.lib + +test: loadlibwin dllrefcount dllgc + +dllrefcount: + $(DMD) -g -m$(MODEL) -conf= -Isrc -defaultlib=$(DRUNTIMELIB) test\shared\src\dllrefcount.d + dllrefcount.exe + del dllrefcount.exe dllrefcount.obj + +loadlibwin: + $(DMD) -g -m$(MODEL) -conf= -Isrc -defaultlib=$(DRUNTIMELIB) test\shared\src\loadlibwin.d + loadlibwin.exe + del loadlibwin.exe loadlibwin.obj + +dllgc: + $(DMD) -g -m$(MODEL) -conf= -Isrc -defaultlib=$(DRUNTIMELIB) -version=DLL -shared -ofdllgc.dll test\shared\src\dllgc.d + $(DMD) -g -m$(MODEL) -conf= -Isrc -defaultlib=$(DRUNTIMELIB) -ofloaddllgc.exe test\shared\src\dllgc.d + loaddllgc.exe + del loaddllgc.exe loaddllgc.obj dllgc.dll dllgc.obj diff --git a/win32.mak b/win32.mak index 9052ac5858..51f368938b 100644 --- a/win32.mak +++ b/win32.mak @@ -117,6 +117,17 @@ test_aa: test_hash: $(DMD) -m$(MODEL) -conf= -Isrc -defaultlib=$(DRUNTIME) -run test\hash\src\test_hash.d +test_gc: + "$(MAKE)" -f test\gc\win64.mak "DMD=$(DMD)" MODEL=$(MODEL) "VCDIR=$(VCDIR)" DRUNTIMELIB=$(DRUNTIME) "CC=$(CC)" test + +custom_gc: + $(MAKE) -f test\init_fini\win64.mak "DMD=$(DMD)" MODEL=$(MODEL) "VCDIR=$(VCDIR)" DRUNTIMELIB=$(DRUNTIME) "CC=$(CC)" test + +test_shared: + $(MAKE) -f 
test\shared\win64.mak "DMD=$(DMD)" MODEL=$(MODEL) "VCDIR=$(VCDIR)" DRUNTIMELIB=$(DRUNTIME) "CC=$(CC)" test + +test_all: test_aa test_hash test_gc custom_gc test_shared + ################### zip/install/clean ########################## zip: druntime.zip @@ -136,4 +147,4 @@ clean: auto-tester-build: target -auto-tester-test: unittest test_aa test_hash +auto-tester-test: unittest test_all diff --git a/win64.mak b/win64.mak index e408774c5b..4f74033f11 100644 --- a/win64.mak +++ b/win64.mak @@ -111,10 +111,10 @@ test_gc: custom_gc: $(MAKE) -f test\init_fini\win64.mak "DMD=$(DMD)" MODEL=$(MODEL) "VCDIR=$(VCDIR)" DRUNTIMELIB=$(DRUNTIME) "CC=$(CC)" test -test_loadlib: - "$(DMD)" -m$(MODEL) -conf= -Isrc -defaultlib=$(DRUNTIME) -run test\shared\src\loadlibwin.d +test_shared: + $(MAKE) -f test\shared\win64.mak "DMD=$(DMD)" MODEL=$(MODEL) "VCDIR=$(VCDIR)" DRUNTIMELIB=$(DRUNTIME) "CC=$(CC)" test -test_all: test_uuid test_aa test_hash test_stdcpp test_gc custom_gc test_loadlib +test_all: test_shared test_uuid test_aa test_hash test_stdcpp test_gc custom_gc ################### zip/install/clean ########################## From bc662c30c65943874261297a3753040e321ff248 Mon Sep 17 00:00:00 2001 From: Rainer Schuetze Date: Sun, 21 Apr 2019 12:30:03 +0200 Subject: [PATCH 2/7] workaround symbol lookup issue --- src/core/thread.d | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/core/thread.d b/src/core/thread.d index d012d6b8e5..9b966b7223 100644 --- a/src/core/thread.d +++ b/src/core/thread.d @@ -5720,11 +5720,12 @@ private // Note: if the DLL is never unloaded, process termination kills all threads // and signals their handles before unconditionally calling DllMain(DLL_PROCESS_DETACH). 
- import core.sys.windows.windows : HMODULE, FreeLibraryAndExitThread, GetModuleHandleExW, + import core.sys.windows.winbase : FreeLibraryAndExitThread, GetModuleHandleExW, GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT; + import core.sys.windows.windef : HMODULE; import core.sys.windows.dll : dll_getRefCount; - version(CRuntime_Microsoft) + version (CRuntime_Microsoft) extern(C) extern __gshared ubyte msvcUsesUCRT; // from rt/msvc.c package(core) bool thread_DLLProcessDetaching; From 0ac33a395effc2c77c531f468b9d8954cf5b82c7 Mon Sep 17 00:00:00 2001 From: Rainer Schuetze Date: Mon, 22 Apr 2019 09:51:09 +0200 Subject: [PATCH 3/7] do not start thread during DLL_PROCESS_DETACH --- src/core/thread.d | 6 +++++- test/shared/src/dllgc.d | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/core/thread.d b/src/core/thread.d index 9b966b7223..613db7b07f 100644 --- a/src/core/thread.d +++ b/src/core/thread.d @@ -5728,7 +5728,7 @@ private version (CRuntime_Microsoft) extern(C) extern __gshared ubyte msvcUsesUCRT; // from rt/msvc.c - package(core) bool thread_DLLProcessDetaching; + package(core) __gshared bool thread_DLLProcessDetaching; __gshared HMODULE ll_dllModule; __gshared ThreadID ll_dllMonitorThread; @@ -5852,6 +5852,10 @@ ThreadID createLowLevelThread(void delegate() nothrow dg, uint stacksize = 0, ThreadID tid; version (Windows) { + // the thread won't start until after the DLL is unloaded + if (thread_DLLProcessDetaching) + return ThreadID.init; + static extern (Windows) uint thread_lowlevelEntry(void* ctx) nothrow { auto dg = *cast(void delegate() nothrow*)ctx; diff --git a/test/shared/src/dllgc.d b/test/shared/src/dllgc.d index ff46b094ea..aca97891f0 100644 --- a/test/shared/src/dllgc.d +++ b/test/shared/src/dllgc.d @@ -37,6 +37,14 @@ version(DLL) static this() { auto tsk = new Task; + assert(tsk.tid != ThreadID.init); + } + + static ~this() + { + // creating thread in shutdown should fail + auto tsk = 
new Task; + assert(tsk.tid == ThreadID.init); } } else From d4be4c615e2ff488ba4cf801ab6814d7905502f4 Mon Sep 17 00:00:00 2001 From: Rainer Schuetze Date: Thu, 21 Mar 2019 09:33:25 +0100 Subject: [PATCH 4/7] GC: add parallel marking --- changelog/gc_parallel.dd | 19 ++ src/core/gc/config.d | 5 +- src/gc/bits.d | 57 ++++++ src/gc/impl/conservative/gc.d | 324 +++++++++++++++++++++++++++++++--- 4 files changed, 375 insertions(+), 30 deletions(-) create mode 100644 changelog/gc_parallel.dd diff --git a/changelog/gc_parallel.dd b/changelog/gc_parallel.dd new file mode 100644 index 0000000000..2113479be7 --- /dev/null +++ b/changelog/gc_parallel.dd @@ -0,0 +1,19 @@ +GC now marks the heap with multiple threads + +The garbage collector now uses available CPU cores to mark the heap +faster. This reduces pause times for a collection considerably. + +By default, the GC uses all available logical cores of your CPU. This +might affect your application if it has threads that are not suspended +during the mark phase of the collection. You can configure the number of +additional threads used for marking by DRT option `parallel` to the +$(LINK2 $(ROOT_DIR)spec/garbage.html, GC configuration), +e.g. by passing `--DRT-gcopt=parallel:2` on the command +line. A value of `0` disables parallel marking completely. + +As usual, you can also embed the configuration into the application by +redefining `rt_options`, e.g. 
+ +------- +extern(C) __gshared string[] rt_options = [ "gcopt=parallel:0" ]; +------- diff --git a/src/core/gc/config.d b/src/core/gc/config.d index 6536f7e8b2..cf6ec189a4 100644 --- a/src/core/gc/config.d +++ b/src/core/gc/config.d @@ -22,6 +22,7 @@ struct Config size_t minPoolSize = 1; // initial and minimum pool size (MB) size_t maxPoolSize = 64; // maximum pool size (MB) size_t incPoolSize = 3; // pool size increment (MB) + uint parallel = 99; // number of additional threads for marking (limited by cpuid.threadsPerCPU-1) float heapSizeFactor = 2.0; // heap size to used memory ratio string cleanup = "collect"; // select gc cleanup method none|collect|finalize @@ -51,11 +52,13 @@ struct Config minPoolSize:N - initial and minimum pool size in MB (%lld) maxPoolSize:N - maximum pool size in MB (%lld) incPoolSize:N - pool size increment MB (%lld) + parallel:N - number of additional threads for marking (%lld) heapSizeFactor:N - targeted heap size to used memory ratio (%g) cleanup:none|collect|finalize - how to treat live objects when terminating (collect) ".ptr, cast(long)initReserve, cast(long)minPoolSize, - cast(long)maxPoolSize, cast(long)incPoolSize, heapSizeFactor); + cast(long)maxPoolSize, cast(long)incPoolSize, + cast(long)parallel, heapSizeFactor); } string errorName() @nogc nothrow { return "GC"; } diff --git a/src/gc/bits.d b/src/gc/bits.d index 406a70f564..f50d46d4c9 100644 --- a/src/gc/bits.d +++ b/src/gc/bits.d @@ -85,6 +85,63 @@ struct GCBits return core.bitop.btr(data, i); } + // return non-zero if bit already set + size_t setLocked(size_t i) nothrow + { + version (D_InlineAsm_X86) + { + asm @nogc nothrow { + naked; // assume RAX=this, [esp+4]=i + mov ECX, data[EAX]; + mov EDX,[ESP+4]; + lock; + bts dword ptr[ECX], EDX; + sbb EAX,EAX; + ret 4; + } + } + else version (D_InlineAsm_X86_64) + { + asm @nogc nothrow { + naked; // assume RCX=this, RDX=i + mov RAX, data[RCX]; + lock; + bts qword ptr[RAX], RDX; + sbb RAX,RAX; + ret; + } + } + else + { + auto 
pos = i >> BITS_SHIFT; + auto pdata = cast(shared)(data + pos); + auto mask = BITS_1 << (i & BITS_MASK); + auto state = *pdata; + if (state & mask) + return state; + + import core.atomic; + auto newstate = state | mask; + while (!cas(pdata, state, newstate)) + { + state = *pdata; + if (state & mask) + return state; + newstate = state | mask; + } + return 0; + } + } + + template testAndSet(bool locked) + { + static if (locked) + alias testAndSet = setLocked; + else + alias testAndSet = set; + } + + mixin template RangeVars() { size_t firstWord = (target >> BITS_SHIFT); diff --git a/src/gc/impl/conservative/gc.d b/src/gc/impl/conservative/gc.d index d326e154e5..7512a65915 100644 --- a/src/gc/impl/conservative/gc.d +++ b/src/gc/impl/conservative/gc.d @@ -18,6 +18,7 @@ module gc.impl.conservative.gc; /************** Debugging ***************************/ //debug = PRINTF; // turn on printf's +//debug = PARALLEL_PRINTF; // turn on printf's //debug = COLLECT_PRINTF; // turn on printf's //debug = MARK_PRINTF; // turn on printf's //debug = PRINTF_TO_FILE; // redirect printf's ouptut to file "gcx.log" @@ -32,6 +33,7 @@ module gc.impl.conservative.gc; //debug = PROFILE_API; // profile API calls for config.profile > 1 /***************************************************/ +version = COLLECT_PARALLEL; // parallel scanning import gc.bits; import gc.os; @@ -1304,6 +1306,9 @@ struct Gcx pauseTime, maxPause, apitxt.ptr); } + version (COLLECT_PARALLEL) + stopScanThreads(); + debug(INVARIANT) initialized = false; for (size_t i = 0; i < npools; i++) @@ -1865,6 +1870,7 @@ struct Gcx { nothrow: @disable this(this); + auto stackLock = shared(AlignedSpinLock)(SpinLock.Contention.brief); void reset() { @@ -1876,6 +1882,10 @@ struct Gcx } _cap = 0; } + void clear() + { + _length = 0; + } void push(RANGE rng) { @@ -1890,6 +1900,19 @@ struct Gcx return _p[--_length]; } + bool popLocked(ref RANGE rng) + { + if (_length == 0) + return false; + + stackLock.lock(); + scope(exit) 
stackLock.unlock(); + if (_length == 0) + return false; + rng = _p[--_length]; + return true; + } + ref inout(RANGE) opIndex(size_t idx) inout in { assert(idx < _length); } do @@ -1926,28 +1949,24 @@ struct Gcx ToScanStack!(ScanRange!false) toscanConservative; ToScanStack!(ScanRange!true) toscanPrecise; + template scanStack(bool precise) + { + static if (precise) + alias scanStack = toscanPrecise; + else + alias scanStack = toscanConservative; + } + /** * Search a range of memory values and mark any pointers into the GC pool. */ - void mark(bool precise)(void *pbot, void *ptop) scope nothrow + private void mark(bool precise, bool parallel)(ScanRange!precise rng) scope nothrow { - static if (precise) - alias toscan = toscanPrecise; - else - alias toscan = toscanConservative; + alias toscan = scanStack!precise; debug(MARK_PRINTF) printf("marking range: [%p..%p] (%#llx)\n", pbot, ptop, cast(long)(ptop - pbot)); - if (pbot >= ptop) - return; - - ScanRange!precise rng = void; - rng.pbot = cast(void **)pbot; - rng.ptop = cast(void **)ptop; - static if (precise) - rng.pbase = null; // always starting from a non-heap root - // limit the amount of ranges added to the toscan stack enum FANOUT_LIMIT = 32; size_t stackPos; @@ -2025,7 +2044,7 @@ struct Gcx biti = offsetBase >> Pool.ShiftBy.Small; //debug(PRINTF) printf("\t\tbiti = x%x\n", biti); - if (!pool.mark.set(biti) && !pool.noscan.test(biti)) + if (!pool.mark.testAndSet!parallel(biti) && !pool.noscan.test(biti)) { tgt.pbot = pool.baseAddr + offsetBase; tgt.ptop = tgt.pbot + binsize[bin]; @@ -2052,7 +2071,7 @@ struct Gcx if (tgt.pbot != sentinel_sub(p) && pool.nointerior.nbits && pool.nointerior.test(biti)) goto LnextPtr; - if (!pool.mark.set(biti) && !pool.noscan.test(biti)) + if (!pool.mark.testAndSet!parallel(biti) && !pool.noscan.test(biti)) { tgt.ptop = tgt.pbot + (cast(LargeObjectPool*)pool).getSize(pn); goto LaddLargeRange; @@ -2067,7 +2086,7 @@ struct Gcx if (pool.nointerior.nbits && pool.nointerior.test(biti)) 
goto LnextPtr; - if (!pool.mark.set(biti) && !pool.noscan.test(biti)) + if (!pool.mark.testAndSet!parallel(biti) && !pool.noscan.test(biti)) { tgt.pbot = pool.baseAddr + (pn * PAGESIZE); tgt.ptop = tgt.pbot + (cast(LargeObjectPool*)pool).getSize(pn); @@ -2125,15 +2144,21 @@ struct Gcx // pop range from local stack and recurse rng = stack[--stackPos]; } - else if (!toscan.empty) - { - // pop range from global stack and recurse - rng = toscan.pop(); - } else { - // nothing more to do - break; + static if (parallel) + { + if (!toscan.popLocked(rng)) + break; // nothing more to do + } + else + { + if (toscan.empty) + break; // nothing more to do + + // pop range from global stack and recurse + rng = toscan.pop(); + } } // printf(" pop [%p..%p] (%#zx)\n", p1, p2, cast(size_t)p2 - cast(size_t)p1); goto LcontRange; @@ -2148,6 +2173,11 @@ struct Gcx stackPos++; continue; } + static if (parallel) + { + toscan.stackLock.lock(); + scope(exit) toscan.stackLock.unlock(); + } toscan.push(rng); // reverse order for depth-first-order traversal foreach_reverse (ref range; stack) @@ -2165,12 +2195,31 @@ struct Gcx void markConservative(void *pbot, void *ptop) scope nothrow { - mark!false(pbot, ptop); + if (pbot < ptop) + mark!(false, false)(ScanRange!false(pbot, ptop)); } void markPrecise(void *pbot, void *ptop) scope nothrow { - mark!true(pbot, ptop); + if (pbot < ptop) + mark!(true, false)(ScanRange!true(pbot, ptop, null)); + } + + version (COLLECT_PARALLEL) + ToScanStack!(void*) toscanRoots; + + version (COLLECT_PARALLEL) + void collectRoots(void *pbot, void *ptop) scope nothrow + { + const minAddr = pooltable.minAddr; + size_t memSize = pooltable.maxAddr - minAddr; + + for (auto p = cast(void**)pbot; cast(void*)p < ptop; p++) + { + auto ptr = *p; + if (cast(size_t)(ptr - minAddr) < memSize) + toscanRoots.push(ptr); + } } // collection step 1: prepare freebits and mark bits @@ -2216,6 +2265,32 @@ struct Gcx //log--; } + version (COLLECT_PARALLEL) + void collectAllRoots(bool 
nostack) nothrow + { + if (!nostack) + { + debug(COLLECT_PRINTF) printf("\tcollect stacks.\n"); + // Scan stacks and registers for each paused thread + thread_scanAll(&collectRoots); + } + + // Scan roots[] + debug(COLLECT_PRINTF) printf("\tcollect roots[]\n"); + foreach (root; roots) + { + toscanRoots.push(root); + } + + // Scan ranges[] + debug(COLLECT_PRINTF) printf("\tcollect ranges[]\n"); + foreach (range; ranges) + { + debug(COLLECT_PRINTF) printf("\t\t%p .. %p\n", range.pbot, range.ptop); + collectRoots(range.pbot, range.ptop); + } + } + // collection step 3: finalize unreferenced objects, recover full pages with no live objects size_t sweep() nothrow { @@ -2521,10 +2596,23 @@ struct Gcx prepTime += (stop - start); start = stop; - if (ConservativeGC.isPrecise) - markAll!markPrecise(nostack); + version (COLLECT_PARALLEL) + bool doParallel = config.parallel > 0; else - markAll!markConservative(nostack); + enum doParallel = false; + + if (doParallel) + { + version (COLLECT_PARALLEL) + markParallel(nostack); + } + else + { + if (ConservativeGC.isPrecise) + markAll!markPrecise(nostack); + else + markAll!markConservative(nostack); + } thread_processGCMarks(&isMarked); thread_resumeAll(); @@ -2606,6 +2694,179 @@ struct Gcx } return IsMarked.unknown; } + + /* ============================ Parallel scanning =============================== */ + version (COLLECT_PARALLEL): + import core.sync.event; + import core.atomic; + private: // disable invariants for background threads + + static struct ScanThreadData + { + ThreadID tid; + } + uint numScanThreads; + ScanThreadData* scanThreadData; + + Event evStart; + Event evDone; + + shared uint busyThreads; + bool stopGC; + + void markParallel(bool nostack) nothrow + { + toscanRoots.clear(); + collectAllRoots(nostack); + if (toscanRoots.empty) + return; + + void** pbot = toscanRoots._p; + void** ptop = toscanRoots._p + toscanRoots._length; + + if (!scanThreadData) + startScanThreads(); + + debug(PARALLEL_PRINTF) 
printf("markParallel\n"); + + size_t pointersPerThread = toscanRoots._length / (numScanThreads + 1); + if (pointersPerThread > 0) + { + void pushRanges(bool precise)() + { + alias toscan = scanStack!precise; + toscan.stackLock.lock(); + + for (int idx = 0; idx < numScanThreads; idx++) + { + toscan.push(ScanRange!precise(pbot, pbot + pointersPerThread)); + pbot += pointersPerThread; + } + toscan.stackLock.unlock(); + } + if (ConservativeGC.isPrecise) + pushRanges!true(); + else + pushRanges!false(); + } + assert(pbot < ptop); + + busyThreads.atomicOp!"+="(1); // main thread is busy + + evStart.set(); + + debug(PARALLEL_PRINTF) printf("mark %lld roots\n", cast(ulong)(ptop - pbot)); + + if (ConservativeGC.isPrecise) + mark!(true, true)(ScanRange!true(pbot, ptop, null)); + else + mark!(false, true)(ScanRange!false(pbot, ptop)); + + busyThreads.atomicOp!"-="(1); + + debug(PARALLEL_PRINTF) printf("waitForScanDone\n"); + pullFromScanStack(); + debug(PARALLEL_PRINTF) printf("waitForScanDone done\n"); + } + + void startScanThreads() nothrow + { + import core.cpuid; + auto threads = threadsPerCPU(); + debug(PARALLEL_PRINTF) printf("startScanThreads: %d threads per CPU\n", threads); + if (threads <= 1) + return; // either core.cpuid not initialized or single core + + numScanThreads = threads >= config.parallel ? 
config.parallel : threads - 1; + + scanThreadData = cast(ScanThreadData*) cstdlib.calloc(numScanThreads, ScanThreadData.sizeof); + if (!scanThreadData) + onOutOfMemoryErrorNoGC(); + + evStart.initialize(false, false); + evDone.initialize(false, false); + + for (int idx = 0; idx < numScanThreads; idx++) + scanThreadData[idx].tid = createLowLevelThread(&scanBackground, 0x4000); + } + + void stopScanThreads() nothrow + { + if (!scanThreadData) + return; + + debug(PARALLEL_PRINTF) printf("stopScanThreads\n"); + stopGC = true; + evStart.set(); + + for (int idx = 0; idx < numScanThreads; idx++) + { + if (scanThreadData[idx].tid != scanThreadData[idx].tid.init) + { + joinLowLevelThread(scanThreadData[idx].tid); + scanThreadData[idx].tid = scanThreadData[idx].tid.init; + } + } + + evDone.terminate(); + evStart.terminate(); + + cstdlib.free(scanThreadData); + scanThreadData = null; + numScanThreads = 0; + + debug(PARALLEL_PRINTF) printf("stopScanThreads done\n"); + } + + void scanBackground() nothrow + { + while (!stopGC) + { + evStart.wait(dur!"msecs"(10)); + pullFromScanStack(); + evDone.set(); + } + } + + void pullFromScanStack() nothrow + { + if (ConservativeGC.isPrecise) + pullFromScanStackImpl!true(); + else + pullFromScanStackImpl!false(); + } + + void pullFromScanStackImpl(bool precise)() nothrow + { + if (atomicLoad(busyThreads) == 0) + return; + + debug(PARALLEL_PRINTF) + pthread_t threadId = pthread_self(); + debug(PARALLEL_PRINTF) printf("scanBackground thread %d start\n", threadId); + + ScanRange!precise rng; + alias toscan = scanStack!precise; + + while (atomicLoad(busyThreads) > 0) + { + if (toscan.empty) + { + evDone.wait(dur!"msecs"(1)); + continue; + } + + busyThreads.atomicOp!"+="(1); + if (toscan.popLocked(rng)) + { + debug(PARALLEL_PRINTF) printf("scanBackground thread %d scanning range [%p,%lld] from stack\n", threadId, + rng.pbot, cast(long) (rng.ptop - rng.pbot)); + mark!(precise, true)(rng); + } + busyThreads.atomicOp!"-="(1); + } + 
debug(PARALLEL_PRINTF) printf("scanBackground thread %d done\n", threadId); + } } /* ============================ Pool =============================== */ @@ -3610,9 +3871,14 @@ debug(PRINTF_TO_FILE) private __gshared MonoTime gcStartTick; private __gshared FILE* gcx_fh; private __gshared bool hadNewline = false; + import core.internal.spinlock; + static printLock = shared(AlignedSpinLock)(SpinLock.Contention.lengthy); private int printf(ARGS...)(const char* fmt, ARGS args) nothrow { + printLock.lock(); + scope(exit) printLock.unlock(); + if (!gcx_fh) gcx_fh = fopen("gcx.log", "w"); if (!gcx_fh) From 9144eb5b16a4d9bc4648532d394f2f8c456ca05c Mon Sep 17 00:00:00 2001 From: Rainer Schuetze Date: Sat, 30 Mar 2019 14:14:10 +0100 Subject: [PATCH 5/7] gc.bits.setLocked: use asm version on Windows only --- src/gc/bits.d | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/gc/bits.d b/src/gc/bits.d index f50d46d4c9..8387b10a09 100644 --- a/src/gc/bits.d +++ b/src/gc/bits.d @@ -88,28 +88,33 @@ struct GCBits // return non-zero if bit already set size_t setLocked(size_t i) nothrow { - version (D_InlineAsm_X86) + version(Windows) { - asm @nogc nothrow { - naked; // assume RAX=this, [esp+4]=i - mov ECX, data[EAX]; - mov EDX,[ESP+4]; - lock; - bts dword ptr[ECX], EDX; - sbb EAX,EAX; - ret 4; + version (D_InlineAsm_X86) + { + asm @nogc nothrow { + naked; // assume RAX=this, [esp+4]=i + mov ECX, data[EAX]; + mov EDX,[ESP+4]; + lock; + bts dword ptr[ECX], EDX; + sbb EAX,EAX; + ret 4; + } } - } - else version (D_InlineAsm_X86_64) - { - asm @nogc nothrow { - naked; // assume RCX=this, RDX=i - mov RAX, data[RCX]; - lock; - bts qword ptr[RAX], RDX; - sbb RAX,RAX; - ret; + else version (D_InlineAsm_X86_64) + { + asm @nogc nothrow { + naked; // assume RCX=this, RDX=i + mov RAX, data[RCX]; + lock; + bts qword ptr[RAX], RDX; + sbb RAX,RAX; + ret; + } } + else + static assert(false, "unexpected Windows architecture"); } else { 
From 6565c4ed40d83715d03583e880959d8d5e92c276 Mon Sep 17 00:00:00 2001 From: Rainer Schuetze Date: Sun, 31 Mar 2019 09:25:16 +0200 Subject: [PATCH 6/7] GCBits.setLocked: use intrinsics for GDC and LDC --- src/gc/bits.d | 59 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/src/gc/bits.d b/src/gc/bits.d index 8387b10a09..caf8416a32 100644 --- a/src/gc/bits.d +++ b/src/gc/bits.d @@ -88,33 +88,44 @@ struct GCBits // return non-zero if bit already set size_t setLocked(size_t i) nothrow { - version(Windows) + version (GNU) { - version (D_InlineAsm_X86) - { - asm @nogc nothrow { - naked; // assume RAX=this, [esp+4]=i - mov ECX, data[EAX]; - mov EDX,[ESP+4]; - lock; - bts dword ptr[ECX], EDX; - sbb EAX,EAX; - ret 4; - } + import gcc.builtins; + const pos = i >> BITS_SHIFT; + const mask = BITS_1 << (i & BITS_MASK); + mixin("auto val = __atomic_fetch_or_" ~ size_t.sizeof.stringof[0] + ~ "(cast(shared)(data + pos), mask, 3);"); + return (val & mask) != 0; + } + else version (LDC) + { + import ldc.intrinsics; + const pos = i >> BITS_SHIFT; + const mask = BITS_1 << (i & BITS_MASK); + auto val = llvm_atomic_rmw_or(cast(shared)(data + pos), mask); + return (val & mask) != 0; + } + else version (D_InlineAsm_X86) + { + asm @nogc nothrow { + mov EAX, this; + mov ECX, data[EAX]; + mov EDX, i; + lock; + bts dword ptr[ECX], EDX; + sbb EAX,EAX; } - else version (D_InlineAsm_X86_64) - { - asm @nogc nothrow { - naked; // assume RCX=this, RDX=i - mov RAX, data[RCX]; - lock; - bts qword ptr[RAX], RDX; - sbb RAX,RAX; - ret; - } + } + else version (D_InlineAsm_X86_64) + { + asm @nogc nothrow { + mov RAX, this; + mov RAX, data[RAX]; + mov RDX, i; + lock; + bts qword ptr[RAX], RDX; + sbb RAX,RAX; } - else - static assert(false, "unexpected Windows architecture"); } else { From 0155891f07e5af26b8e7492d91ad23e6b08ad3fa Mon Sep 17 00:00:00 2001 From: Rainer Schuetze Date: Sun, 21 Apr 2019 19:20:53 +0200 Subject: [PATCH 7/7] fix 
initialization of core.cpuid, use DLL unload callback --- src/gc/impl/conservative/gc.d | 39 +++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/gc/impl/conservative/gc.d b/src/gc/impl/conservative/gc.d index 7512a65915..06548777e7 100644 --- a/src/gc/impl/conservative/gc.d +++ b/src/gc/impl/conservative/gc.d @@ -2769,10 +2769,41 @@ struct Gcx debug(PARALLEL_PRINTF) printf("waitForScanDone done\n"); } - void startScanThreads() nothrow + int maxParallelThreads() nothrow { import core.cpuid; auto threads = threadsPerCPU(); + + if (threads == 0) + { + // If the GC is called by module ctors no explicit + // import dependency on the GC is generated. So the + // GC module is not correctly inserted into the module + // initialization chain. As it relies on core.cpuid being + // initialized, force this here. + try + { + foreach (m; ModuleInfo) + if (m.name == "core.cpuid") + if (auto ctor = m.ctor()) + { + ctor(); + threads = threadsPerCPU(); + break; + } + } + catch (Exception) + { + assert(false, "unexpected exception iterating ModuleInfo"); + } + } + return threads; + } + + + void startScanThreads() nothrow + { + auto threads = maxParallelThreads(); debug(PARALLEL_PRINTF) printf("startScanThreads: %d threads per CPU\n", threads); if (threads <= 1) return; // either core.cpuid not initialized or single core @@ -2787,12 +2818,12 @@ struct Gcx evDone.initialize(false, false); for (int idx = 0; idx < numScanThreads; idx++) - scanThreadData[idx].tid = createLowLevelThread(&scanBackground, 0x4000); + scanThreadData[idx].tid = createLowLevelThread(&scanBackground, 0x4000, &stopScanThreads); } void stopScanThreads() nothrow { - if (!scanThreadData) + if (!numScanThreads) return; debug(PARALLEL_PRINTF) printf("stopScanThreads\n"); @@ -2812,7 +2843,7 @@ struct Gcx evStart.terminate(); cstdlib.free(scanThreadData); - scanThreadData = null; + // scanThreadData = null; // keep non-null to not start again after shutdown numScanThreads 
= 0; debug(PARALLEL_PRINTF) printf("stopScanThreads done\n");