The following C function attempts to prevent recursion in multicore code in a thread-safe manner using a thread local storage variable. However, for reasons that are somewhat complicated, I NEED to write this function in X64 assembler (Intel X86 / AMD 64-bit) and assemble it with ml64.exe from VC2010. I know how to do this if I'm using global variables but I'm not sure how to do it properly with a TLS variable that has __declspec(thread).
/* Per-thread flag: set to 1 while the owning thread is inside DoWork(). */
__declspec(thread) int tls_VAR = 0;

/*
 * Calls DoWork() unless the calling thread is already executing it.
 * Each thread has its own copy of tls_VAR, so the guard only blocks
 * re-entry on the same thread.
 */
void norecurse( )
{
    /* Already inside DoWork() on this thread: do nothing. */
    if (tls_VAR != 0)
        return;

    tls_VAR = 1;
    DoWork();
    tls_VAR = 0;
}
Note: This is what VC2010 kicks out for the function if I request a listing file:
; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01
; Compiler output for norecurse(): guards the call to DoWork with the
; thread-local flag tls_VAR.
include listing.inc
INCLUDELIB MSVCRTD
INCLUDELIB OLDNAMES
PUBLIC norecurse
EXTRN DoWork:PROC
EXTRN tls_VAR:DWORD
EXTRN _tls_index:DWORD
; Function-table entry for norecurse: image-relative start, end (start+70)
; and the address of its unwind data.
pdata SEGMENT
$pdata$norecurse DD imagerel $LN4
DD imagerel $LN4+70
DD imagerel $unwind$norecurse
pdata ENDS
; Unwind data referenced by the pdata entry above (encoded by the compiler).
xdata SEGMENT
$unwind$norecurse DD 040a01H
DD 06340aH
DD 07006320aH
; Function compile flags: /Ogtpy
xdata ENDS
_TEXT SEGMENT
norecurse PROC
; File p:\hackytests\64bittest2010\64bittest\64bittest.cpp
; Line 19
$LN4:
; Prologue: rbx is stored at [rsp+8], just above the return address (the
; caller-provided home space), rdi is pushed, and 32 bytes are reserved as
; shadow space for the call to DoWork. rbx and rdi are callee-saved, so
; the values kept in them survive that call.
mov QWORD PTR [rsp+8], rbx
push rdi
sub rsp, 32 ; 00000020H
; Line 20
; ecx = _tls_index; rax = qword at gs:88 (the TEB's ThreadLocalStoragePointer
; on x64); rbx = rax[_tls_index], presumably the base of this module's TLS
; block for the current thread.
; edi = offset of tls_VAR that is added to that base (the listing does not
; show which relocation type the object file uses for it).
mov ecx, DWORD PTR _tls_index
mov rax, QWORD PTR gs:88
mov edi, OFFSET FLAT:tls_VAR
mov rbx, QWORD PTR [rax+rcx*8]
; [rbx+rdi] is this thread's tls_VAR; skip the work if it is already set.
cmp DWORD PTR [rbx+rdi], 0
jne SHORT $LN1@norecurse
; Line 22
mov DWORD PTR [rbx+rdi], 1
; Line 23
call DoWork
; Line 24
mov DWORD PTR [rbx+rdi], 0
$LN1@norecurse:
; Line 26
; Epilogue: [rsp+48] is the same slot as [rsp+8] at entry
; (8 for the pushed rdi + 32 for the shadow space + 8).
mov rbx, QWORD PTR [rsp+48]
add rsp, 32 ; 00000020H
pop rdi
ret 0
norecurse ENDP
_TEXT ENDS
END
I was able to hack together a workaround for the issue. My implementation in assembler is less efficient than the code generated by the C compiler, though, because I could not figure out how to express the following two addressing modes in ml64:
mov rax, QWORD PTR gs:88
mov edi, OFFSET FLAT:tls_VAR
For the first one (`mov rax, QWORD PTR gs:88`), I had to load 88 into rax and then use gs:[rax] to read the thread's TLS pointer from the TEB.
For the second one (`mov edi, OFFSET FLAT:tls_VAR`), the lack of OFFSET FLAT in MASM (ml64.exe) meant that I had to be more clever. I subtract the address of _tls_start from the thread's TLS base; the result is an offset-adjusted base that can be added to the address of any TLS variable in assembler to reach that variable's thread-local instance.
So this is my hack implementation that I would like to improve / do correctly.
;-----------------------------------------------------------------------
; void norecurse(void)
; Syntax: MASM (ml64.exe)      ABI: Microsoft x64
;
; Calls DoWork() unless the calling thread is already inside it. The
; per-thread flag tls_VAR (declared __declspec(thread) on the C side)
; is set to 1 before the call and cleared to 0 after it returns.
;
; In:    nothing
; Out:   nothing
; Clobb: rax, rdx, r10, r11, flags (plus whatever DoWork clobbers)
; Stack: 8 bytes (saved rbx) + 32 bytes (shadow space for DoWork);
;        rsp is 16-byte aligned at the call.
;
; The procedure is declared PROC FRAME and its prologue is annotated
; with .pushreg/.allocstack so that unwind data is emitted; without it
; an exception raised inside DoWork cannot unwind through this frame.
;
; NOTE(review): ml64 should also accept the bracketed displacement form
; "mov rax, gs:[58h]" and "mov ecx, SECTIONREL tls_VAR", which would
; remove the _tls_start arithmetic below - verify before switching.
;-----------------------------------------------------------------------
PUBLIC  norecurse
EXTRN   _tls_index:DWORD
EXTRN   _tls_start:DWORD
EXTRN   tls_VAR:DWORD
EXTRN   DoWork:PROC

; Offset of the ThreadLocalStoragePointer member in the x64 TEB (0x58).
TEB_TLS_POINTER EQU     88
; Shadow space the caller must reserve before any call (Microsoft x64).
SHADOW_SPACE    EQU     32

_TEXT   SEGMENT
norecurse PROC FRAME
        push    rbx                             ; callee-saved; holds &tls_VAR across the call
        .pushreg rbx
        sub     rsp, SHADOW_SPACE
        .allocstack SHADOW_SPACE
        .endprolog

        ; gs refers to the base of the TEB on x64.
        mov     rax, TEB_TLS_POINTER
        mov     edx, DWORD PTR _tls_index
        mov     rax, gs:[rax]                   ; rax = TEB.ThreadLocalStoragePointer
        mov     r11, QWORD PTR [rax+rdx*8]      ; r11 = this thread's TLS base for this module

        ; r11 becomes the offset-adjusted TLS base: adding the address of
        ; any TLS variable to it gives that variable's thread-local address.
        lea     r10, _tls_start
        sub     r11, r10

        ; rbx = thread-local address of tls_VAR
        lea     rdx, tls_VAR
        lea     rbx, [r11+rdx]

        cmp     DWORD PTR [rbx], 0
        jne     skip_work                       ; already inside DoWork on this thread
        mov     DWORD PTR [rbx], 1
        call    DoWork
        mov     DWORD PTR [rbx], 0

skip_work:
        add     rsp, SHADOW_SPACE
        pop     rbx
        ret
norecurse ENDP
_TEXT   ENDS
END
I'd love to see a more efficient method, or pointers on how to actually use the two addressing modes I couldn't figure out with MASM (ml64.exe).