News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Modified JWasm

Started by habran, March 01, 2012, 02:42:17 AM

Previous topic - Next topic

johnsa

With both of the above the stack reservation should be ? (If I'm not mistaken)

LOCALS:
MSG(48 bytes)
HWND(8 bytes)
WNDCLASSEX(80 bytes)

PARAMS:
Maximum is 12 (min would be 4) so 12*8: 96bytes

That gives me... 232 bytes (as opposed to 224 OR 288)


habran

It should be 240 and it is 0xF0
because 232 is 0xE8, which is not 16-byte aligned, so 8 bytes are added to bring it to 240
and that is what I get when I look at disassembly  with WinDbg
I don't know how did you get 224 except if you used the version which you compiled
please, use the precompiled  version I posted here and do not need to use "AND RSP, -16"
because Jwasm takes care of stack alignment

VS2010 with 288 exaggerated, that is why we love assembly language

Japheth's 2.07 source is not available

However, as soon as it becomes I will modify it and post it (if maestro Japheth doesn't come with something better than that)




Greenhorn__

Quote from: johnsa on March 05, 2012, 09:11:09 AM
In fact switching the C code between CreateWindow and CreateWindowEx (which has an extra param) still leaves the RSP subtraction at 120h (288).
So it would appear that it rounds it up to some sort of multiple?

Hi,

CreateWindow is not a function, it's a macro.  :wink

#define CreateWindowA(lpClassName, lpWindowName, dwStyle, x, y,\
nWidth, nHeight, hWndParent, hMenu, hInstance, lpParam)\
CreateWindowExA(0L, lpClassName, lpWindowName, dwStyle, x, y,\
nWidth, nHeight, hWndParent, hMenu, hInstance, lpParam)
#define CreateWindowW(lpClassName, lpWindowName, dwStyle, x, y,\
nWidth, nHeight, hWndParent, hMenu, hInstance, lpParam)\
CreateWindowExW(0L, lpClassName, lpWindowName, dwStyle, x, y,\
nWidth, nHeight, hWndParent, hMenu, hInstance, lpParam)
#ifdef UNICODE
#define CreateWindow  CreateWindowW
#else
#define CreateWindow  CreateWindowA
#endif // !UNICODE



Regards
Greenhorn
You can fool some of the people all of the time, and all of the people some of the time, but you can not fool all of the people all of the time.
(Abraham Lincoln)

johnsa

I get 224 bytes... using the pre-compiled jwasm.exe you've supplied.

built with:

jwasm -c -Zi -Zf -Zd -win64 test64.asm
link /machine:x64 /subsystem:windows /entry:WinMainCRTStartup /debug /Libpath:"C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\Lib\x64" test64.obj

Dis-asm listing when debugged with VS2010...

000000013F0F1087  mov         qword ptr [rsp+8],rcx 
000000013F0F108C  mov         qword ptr [rsp+10h],rdx 
000000013F0F1091  mov         qword ptr [rsp+18h],r8 
000000013F0F1096  mov         qword ptr [rsp+20h],r9 
000000013F0F109B  push        rbp 
000000013F0F109C  mov         rbp,rsp 
000000013F0F109F  sub         rsp,0E0h 


Source code is as posted for the example app.

johnsa

just to make it more clear (I obviously removed the shadow entries from the source):


WinMain proc FRAME hInst:HINSTANCE, hPrevInst:HINSTANCE, CmdLine:LPSTR, CmdShow:UINT

local wc:WNDCLASSEXA
local msg:MSG
local hwnd:HWND

    mov   hInst, rcx
000000013F091087  mov         qword ptr [rsp+8],rcx 
000000013F09108C  mov         qword ptr [rsp+10h],rdx 
000000013F091091  mov         qword ptr [rsp+18h],r8 
000000013F091096  mov         qword ptr [rsp+20h],r9 
000000013F09109B  push        rbp 
000000013F09109C  mov         rbp,rsp 
000000013F09109F  sub         rsp,0E0h 
000000013F0910A6  mov         qword ptr [rbp+10h],rcx 
    mov   wc.cbSize, SIZEOF WNDCLASSEXA
000000013F0910AA  mov         dword ptr [rbp-50h],50h 

habran

try this : -c -win64 -Zp8 -D_WIN64 -I -Zi -Zd

-Zp8 selects 8-byte alignment; without it the reservation comes to E0h, with it it is F0h

habran

here are modifications to JWasm sources


file:
globals.h
line: 675


/* global variables */
extern char                 maxparams;
extern char *CurrMaxParams[];


file:
invoke.c
line: 28

extern char maxparams;  // this is needed here only for PellesC

line: 148

static void ms64_fcstart( struct dsym *proc, int numparams, int start, struct asm_tok tokenarray[], int *value )
/**************************************************************************************************************/
{
    /* v2.04: for a VARARG proc the incoming count is discarded and
     * recomputed as the number of commas found from tokenarray[start]
     * up to the terminating T_FINAL token.
     */
    if ( proc->e.procinfo->is_vararg ) {
        numparams = 0;
        while ( tokenarray[start].token != T_FINAL ) {
            if ( tokenarray[start].token == T_COMMA )
                numparams++;
            start++;
        }
    }
    DebugMsg1(("ms64_fcstart(%s, numparams=%u) vararg=%u\n", proc->sym.name, numparams, proc->e.procinfo->is_vararg ));

    /* never fewer than 4 slots; an odd count above that is bumped to the next even one */
    if ( numparams < 4 )
        numparams = 4;
    else
        numparams += ( numparams & 1 );

    /* hand the rounded count back to the caller */
    *value = numparams;

    /* track the largest count seen so far in the global maxparams */
    if ( maxparams < numparams )
        maxparams = numparams;

    /* No "sub rsp, numparams * 8" is queued here any more: this modified
     * version deliberately leaves the stack pointer alone at the call site.
     */
}
static void ms64_fcend( struct dsym *proc, int numparams, int value )
/*******************************************************************/
{
    /* Intentionally a no-op. <value> is what ms64_fcstart() stored, but the
     * matching "add rsp, value * 8" is no longer emitted: this modified
     * version does not touch the stack pointer around a call.
     */
    (void)proc;
    (void)numparams;
    (void)value;
}


file:
proc.c

line: 77

/* Largest (already rounded) parameter count seen by ms64_fcstart() so far;
 * the Win64 epilogue records it in CurrMaxParams[procidx] and resets it to 0.
 */
char maxparams;
/* Working copy of CurrMaxParams[procidx] loaded by the Win64 prologue, which
 * rounds it up to an even value and adds 4 when it is below 4. Prologue and
 * epilogue both use maxparams1 * 8 when adjusting RSP.
 */
char maxparams1;
/* Per-procedure parameter-count table, indexed by procidx. The entries are
 * declared as char* but hold small integers that are stored and fetched
 * through casts; they are not real pointers.
 * 0x1000 entries give room for 4096 procedures; change the size only if you
 * know what you are doing.
 * NOTE(review): no bounds check on procidx is visible in this excerpt.
 */
char *CurrMaxParams[0x1000];

line: 420

#if AMD64_SUPPORT
    /* adjust start displacement for Win64 FRAME procs.
     * v2.06: the list may contain xmm registers, which have size 16!
     */
    if ( info->isframe ) {
        uint_16 *regs = info->regslist;
        int sizestd = 0;
        int sizexmm = 0;
        if ( regs )
            for( cnt = *regs++; cnt; cnt--, regs++ )
                if ( GetValueSp( *regs ) & OP_XMM )
                    sizexmm += 16;
                else
                    sizestd += 8;
        displ = sizexmm + sizestd;
        if (( sizestd & 0xf ) && sizexmm)
            displ += 8;
    }
#endif

line: 1980

static ret_code write_win64_default_prologue( struct proc_info *info )
/********************************************************************/
{
    uint_16             *regist;
    int                 sizestd = 0;
    int                 sizexmm = 0;

    DebugMsg1(("write_win64_default_prologue enter\n"));
    PushLineQueue();

    if ( ModuleInfo.win64_saveparams )
        win64_SaveRegParams( info );
    /*
     * PUSH RBP
     * .PUSHREG RBP
     * MOV RBP, RSP
     * .SETFRAME RBP, 0
     */
    AddLineQueueX( "push %r", T_RBP );
    AddLineQueueX( "%r %r", T_DOT_PUSHREG, T_RBP );
    AddLineQueueX( "mov %r, %r", T_RBP, T_RSP );
    AddLineQueueX( "%r %r, 0", T_DOT_SETFRAME, T_RBP );

    /* after the "push rbp", the stack is xmmword aligned */

    /* Push the registers */
    if( info->regslist ) {
        int cnt;
        regist = info->regslist;
        for( cnt = *regist++; cnt; cnt--, regist++ ) {
            //int i;
            if ( GetValueSp( *regist ) & OP_XMM ) {
                sizexmm += 16;
            } else {
                sizestd += 8;
                AddLineQueueX( "push %r", *regist );
                if ( ( 1 << GetRegNo( *regist ) ) & win64_nvgpr ) {
                    AddLineQueueX( "%r %r", T_DOT_PUSHREG, *regist );
                }
            }
        } /* end for */

        DebugMsg1(("write_win64_default_prologue: sizestd=%u, sizexmm=%u\n", sizestd, sizexmm ));
        sizestd &= 0xF; /* result will be 8 or 0. Just this amount is needed below */
#if 1
        /* save xmm registers */
        if ( sizexmm ) {
            int i;
            AddLineQueueX( "sub %r, %d", T_RSP, NUMQUAL sizexmm + sizestd );
            AddLineQueueX( "%r %d", T_DOT_ALLOCSTACK, NUMQUAL sizexmm + sizestd );
            sizestd = 0; /* stack is aligned now. Don't use sizestd anymore */
            regist = info->regslist;
            for( cnt = *regist++, i = 0; cnt; cnt--, regist++ ) {
                if ( GetValueSp( *regist ) & OP_XMM ) {
                    AddLineQueueX( "movdqa [%r+%u], %r", T_RSP, NUMQUAL i, *regist );
                    if ( ( 1 << GetRegNo( *regist ) ) & win64_nvxmm )  {
                        AddLineQueueX( "%r %r, %u", T_DOT_SAVEXMM128, *regist, NUMQUAL i );
                    }
                    i += 16;
                }
            }
        }
#endif
    }
    info->localsize = ROUND_UP( info->localsize, CurrWordSize );
    /* alloc space for local variables and align the stack. */
maxparams1 = (char)CurrMaxParams[procidx];
if( info->localsize + sizestd || maxparams1 ) {
        /* align the stack if necessary. */
        if ( ( sizestd && (!(info->localsize & 0xF ) ) ) ||
            ( sizestd == 0 && (info->localsize & 0xF ) ) )
            info->localsize += 8;
  if(maxparams1 & 1) maxparams1++;
  if(maxparams1 < 4) maxparams1 += 4;

        DebugMsg1(("write_win64_default_prologue: localsize=%u, sizestd=%u\n", info->localsize, sizestd ));

        /*
         * SUB  RSP, localsize
         * .ALLOCSTACK localsize
         */

        AddLineQueueX( "sub %r, %d", T_RSP, NUMQUAL info->localsize + (maxparams1 * 8));
        AddLineQueueX( "%r %d", T_DOT_ALLOCSTACK, NUMQUAL info->localsize + (maxparams1 * 8));
    }

    AddLineQueueX( "%r", T_DOT_ENDPROLOG );
 
 

line: 2559

static void write_win64_default_epilogue( struct proc_info *info )
/****************************************************************/
{
    uint sizexmm = 0;
    uint sizestd = 0;
    /* restore non-volatile xmm registers */
    if ( info->regslist ) {
        uint_16 *regist = info->regslist;
        int cnt;
        for( cnt = *regist++; cnt; cnt--, regist++ ) {
            if ( GetValueSp( *regist ) & OP_XMM ) {
                AddLineQueueX( "movdqa %r, [%r+%u]", *regist, stackreg[ModuleInfo.Ofssize], NUMQUAL info->localsize + sizexmm );
                sizexmm += 16;
            } else
                sizestd += 8;
        }
    }
    sizestd &= 0xf;
    /* v2.06: must match alignment of prologue! */
    if ( sizexmm && sizestd ) sizexmm += sizestd;
    //sprintf( buffer, "add %s, %d", GetResWName( stackreg[ModuleInfo.Ofssize], NULL ), info->localsize + sizexmm + sizestd );
    AddLineQueueX( "add %r, %d", stackreg[ModuleInfo.Ofssize], NUMQUAL info->localsize + sizexmm + (maxparams1 * 8));
CurrMaxParams[procidx] = (char*) maxparams;  //at the end of the function we have corect number of params in maxparams
maxparams = 0;              //reset maxparams for the next function
pop_register( CurrProc->e.procinfo->regslist );
    AddLineQueueX( "pop %r", basereg[ModuleInfo.Ofssize] );
return;
}






habran

and here is again JWasm.exe

johnsa

Assembling with Zp8 does indeed produce 0f0h (240)
so basically it's padding any structure member less than a qword up to a qword when used as a local (on the stack) to ensure correct alignment of each member variable. Which is great.

All we need now is an option to ensure that the address (RSP or RBP as used) for a particular stack variable can be aligned. This should then ensure that we can use movaps, movdqa for any local SIMD var.
I'm not sure how people feel about the syntax of this, I think it could be specified on the LOCAL itself or on the struct definition?
I tend to prefer on the local, as it means I'm free to use the struct elsewhere without the alignment and we can create a LOCAL using a different form.

IE:

LOCAL MyVector:_m128:ALIGN 16

or

LOCAL  MyVector[4]:DWORD:ALIGN 16

That and Codeview v8 support.. and I think 64bit asm is just about set!


habran

If you look at this "static ret_code write_win64_default_prologue( struct proc_info *info )" function you will see that stack is being aligned to 16 bytes
so your wishes have been fulfilled
there is only  Codeview v8 support pending and you will be fully satisfied
However, we have to wait for japheth to come back for that
It will probably take long time for him to finish that, but for now we have pretty good tools to work with
this version satisfy me fully and I am happy to have it

best regards


johnsa

Hey,

The stack is aligned, but what happens with the prologue generation if I were to do this:


; Question example: a 4-byte REAL4 local declared between two 16-byte locals.
; NOTE(review): a and b are used both as parameter names and as LOCAL names here;
; the later repost of this example in the thread names the parameters ar and br.
MyProc proc a:DWORD, b:DWORD

LOCAL a:_mm128
LOCAL b:REAL4
LOCAL c:_mm128

movdqa a,xmm0
movss b,xmm1
movaps c,xmm2

ret
MyProc endp


for example.. is it ensuring that a's address is 16 byte aligned, AND c, noting that it has a 4 byte value stuck in-between? If it's doing that already... then SERIOUS hats off to you sir :)

johnsa

I guess as a work-around if need be.. you could just ensure you create all the LOCALs which need to be aligned at the beginning.. IE:

LOCAL a:_mm128
LOCAL b:_mm128
LOCAL c:REAL4

that way each is 16 bytes in size, which should maintain the stack alignment?

habran

how does it look to you? OK?



; 128-bit value viewed as four 32-bit integers (4 x DWORD = 16 bytes)
__mm128i struct
i0 DWORD ?
i1 DWORD ?
i2 DWORD ?
i3 DWORD ?
__mm128i ends

; alias with a single leading underscore
_mm128i typedef __mm128i

; 128-bit value viewed as four single-precision floats (4 x REAL4 = 16 bytes)
__mm128f struct
f0 real4 ?
f1 real4 ?
f2 real4 ?
f3 real4 ?
__mm128f ends

; alias with a single leading underscore
_mm128f typedef __mm128f

; union overlaying the integer view (i32) and the float view (f32) of the same 16 bytes
_mm128 union
i32 _mm128i <>
f32 _mm128f <>
_mm128 ends


; Test case: a 4-byte local (b) declared between two 16-byte locals (a, ci).
; In the debugger listing that follows in the thread, a is at [rbp-10h],
; b at [rbp-14h] and ci at [rbp-28h].
MyProc proc ar:DWORD, br:DWORD
LOCAL a:_mm128
LOCAL b:REAL4
LOCAL ci:_mm128

movdqa a,xmm0
movss b,xmm1
; reported in the thread to raise an access violation: ci is at rbp-28h,
; and 28h (40) is not a multiple of 16
movaps ci,xmm2
ret
MyProc endp

proctest!MyProc:
00000001`40001040 48894c2408      mov     qword ptr [rsp+8],rcx
00000001`40001045 4889542410      mov     qword ptr [rsp+10h],rdx
00000001`4000104a 55              push    rbp
00000001`4000104b 488bec          mov     rbp,rsp
00000001`4000104e 4883ec28        sub     rsp,28h
00000001`40001052 660f7f45f0      movdqa  xmmword ptr [rbp-10h],xmm0
00000001`40001057 f30f114dec      movss   dword ptr [rbp-14h],xmm1
00000001`4000105c 0f2955d8        movaps  xmmword ptr [rbp-28h],xmm2
00000001`40001060 c9              leave
00000001`40001061 c3              ret


but it causes access violation

; Same procedure with the 4-byte local declared after both 16-byte locals;
; reported as working in the thread.
; Presumably a and ci then both sit at multiples of 16 below rbp - no listing
; for this ordering is shown, so confirm in a debugger.
MyProc proc ar:DWORD, br:DWORD
LOCAL a:_mm128
LOCAL ci:_mm128
LOCAL b:REAL4

movdqa a,xmm0
movss b,xmm1
movaps ci,xmm2
ret
MyProc endp

this is OK

johnsa

It looks lovely :) but.. i get an access violation when i run it..


MyProc proc ar:DWORD, br:DWORD
000000013F1E1248  mov         qword ptr [rsp+8],rcx 
000000013F1E124D  mov         qword ptr [rsp+10h],rdx 
000000013F1E1252  push        rbp 
000000013F1E1253  mov         rbp,rsp 
000000013F1E1256  sub         rsp,28h 
LOCAL a:_mm128
LOCAL b:REAL4
LOCAL ci:_mm128

movdqa a,xmm0
000000013F1E125A  movdqa      xmmword ptr [rbp-10h],xmm0 
movss b,xmm1
000000013F1E125F  movss       dword ptr [rbp-14h],xmm1 
movaps ci,xmm2
000000013F1E1264  movaps      xmmword ptr [rbp-28h],xmm2              ; <-- Access Violation Here.. rbp-28h isn't 16 aligned.
ret
000000013F1E1268  leave 
000000013F1E1269  ret 


I've asked Japheth to please integrate your changes based on this thread into 2.07, and i'm just waiting for him to come back on what debug data is currently generated in the COFF OBJ, I'm assuming its the older V4 format?
In which case what his time lines are looking like. Perhaps it would be worth the effort to build a separate utility to update the OBJ to be at least more V8 compatible if it's going to take him too long.

johnsa

Swapping b and ci around fixes it.. 28h = 40, which isn't a multiple of 16 .. so if RBP is aligned to 16, RBP-28h can't be.