Date: Fri, 26 Aug 2016 13:54:39 +0100
From: Leif Lindholm
To: Ard Biesheuvel
Cc: edk2-devel@lists.01.org, jbrasen@codeaurora.org, feng.tian@intel.com, star.zeng@intel.com, daniil.egranov@arm.com
Message-ID: <20160826125439.GF4715@bivouac.eciton.net>
In-Reply-To: <1471445945-19239-4-git-send-email-ard.biesheuvel@linaro.org>
References: <1471445945-19239-1-git-send-email-ard.biesheuvel@linaro.org>
 <1471445945-19239-4-git-send-email-ard.biesheuvel@linaro.org>
Subject: Re: [PATCH v3 3/4] MdeModulePkg/EbcDxe AARCH64: use tail call for EBC to native thunk

On Wed, Aug 17, 2016 at 04:59:04PM +0200, Ard Biesheuvel wrote:
> Instead of pessimistically copying at least 64 bytes from the VM stack
> to the native stack, and popping off the register arguments again
> before doing the native call, try to avoid touching the stack completely
> if the VM stack frame is < 64 bytes. Also, if the stack frame does exceed

Should this say "does not"?

> 64 bytes, there is no need to copy the first 64 bytes, since we are passing
> those in registers anyway.
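
Just to confirm I am reading the intent right, below is roughly how I picture
the fast path in C. EbcCallNativeSketch, NATIVE_CALL_8 and Args are made-up
names for illustration only (nothing like this exists in EbcDxe); the 64-byte
cut-off corresponds to the 8 UINTN argument slots that fit in x0-x7 under
AAPCS64.

  #include <stdint.h>

  typedef uint64_t UINTN;

  //
  // Hypothetical native function taking the maximum number of register
  // arguments: under AAPCS64 the first 8 UINTN arguments go in x0-x7,
  // which is where the 64-byte (8 slots x 8 bytes) cut-off comes from.
  //
  typedef UINTN (*NATIVE_CALL_8) (UINTN, UINTN, UINTN, UINTN,
                                  UINTN, UINTN, UINTN, UINTN);

  UINTN
  EbcCallNativeSketch (
    NATIVE_CALL_8  Func,              // FuncAddr
    UINTN          *NewStackPointer,  // lowest VM stack slot, i.e. argument #1
    UINTN          *FramePtr          // one past the last VM argument slot
    )
  {
    UINTN  Length;
    UINTN  Args[8] = { 0 };
    UINTN  Index;

    Length = (UINTN)FramePtr - (UINTN)NewStackPointer;

    if (Length <= 64) {
      //
      // At most 8 argument slots: everything travels in registers, the
      // native stack is never touched, and the call can be a tail call.
      //
      for (Index = 0; Index < Length / sizeof (UINTN); Index++) {
        Args[Index] = NewStackPointer[Index];
      }

      return Func (Args[0], Args[1], Args[2], Args[3],
                   Args[4], Args[5], Args[6], Args[7]);
    }

    //
    // More than 8 slots: the slots beyond the first 64 bytes have to land on
    // the native stack before the call, which cannot be written in portable C
    // - that is what the code at the 1: label in the assembly below does.
    //
    return 0;
  }
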
> 
> Contributed-under: TianoCore Contribution Agreement 1.0
> Signed-off-by: Ard Biesheuvel
> ---
>  MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S | 73 +++++++++++++++-----
>  1 file changed, 55 insertions(+), 18 deletions(-)
> 
> diff --git a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S
> index cb7a70b5a4f8..d95713e82b0f 100644
> --- a/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S
> +++ b/MdeModulePkg/Universal/EbcDxe/AArch64/EbcLowLevel.S
> @@ -35,30 +35,67 @@ ASM_GLOBAL ASM_PFX(mEbcInstructionBufferTemplate)
>  //****************************************************************************
>  // UINTN EbcLLCALLEXNative(UINTN FuncAddr, UINTN NewStackPointer, VOID *FramePtr)
>  ASM_PFX(EbcLLCALLEXNative):
> -    stp    x19, x20, [sp, #-16]!
> -    stp    x29, x30, [sp, #-16]!
> +    mov    x8, x0                 // Preserve x0
> +    mov    x9, x1                 // Preserve x1
> 
> -    mov    x19, x0
> -    mov    x20, sp
> -    sub    x2, x2, x1             // Length = NewStackPointer-FramePtr
> -    sub    sp, sp, x2
> -    sub    sp, sp, #64            // Make sure there is room for at least 8 args in the new stack
> -    mov    x0, sp
> +    //
> +    // If the EBC stack frame is smaller than or equal to 64 bytes, we know there
> +    // are no stacked arguments #9 and beyond that we need to copy to the native
> +    // stack. In this case, we can perform a tail call which is much more
> +    // efficient, since there is no need to touch the native stack at all.
> +    //
> +    sub    x3, x2, x1             // Length = NewStackPointer - FramePtr
> +    cmp    x3, #64
> +    b.gt   1f
> 
> -    bl     CopyMem                // Sp, NewStackPointer, Length
> +    adr    x0, 0f
> +    sub    x0, x0, x3, lsr #1
> +    br     x0
> 
> -    ldp    x0, x1, [sp], #16
> -    ldp    x2, x3, [sp], #16
> -    ldp    x4, x5, [sp], #16
> -    ldp    x6, x7, [sp], #16
> +    ldr    x7, [x9, #56]
> +    ldr    x6, [x9, #48]
> +    ldr    x5, [x9, #40]
> +    ldr    x4, [x9, #32]
> +    ldr    x3, [x9, #24]
> +    ldr    x2, [x9, #16]
> +    ldr    x1, [x9, #8]
> +    ldr    x0, [x9]

Why not keep using ldp, but with x9?

> 
> -    blr    x19
> +0: br     x8
> 
> -    mov    sp, x20
> -    ldp    x29, x30, [sp], #16
> -    ldp    x19, x20, [sp], #16
> +    //
> +    // More than 64 bytes: we need to build the full native stack frame and copy
> +    // the part of the VM stack exceeding 64 bytes (which may contain stacked
> +    // arguments) to the native stack
> +    //
> +1:  stp    x29, x30, [sp, #-16]!
> +    mov    x29, sp
> 
> -    ret
> +    //
> +    // Ensure that the stack pointer remains 16 byte aligned,
> +    // even if the size of the VM stack frame is not a multiple of 16
> +    //
> +    add    x1, x1, #64            // Skip over [potential] reg params
> +    tbz    x3, #3, 2f             // Multiple of 16?
> +    ldr    x4, [x2, #-8]!         // No? Then push one word
> +    str    x4, [sp, #-16]!        // ... but use two slots
> +    b      3f
> +
> +2:  ldp    x4, x5, [x2, #-16]!
> +    stp    x4, x5, [sp, #-16]!
> +3:  cmp    x2, x1
> +    b.gt   2b
> +
> +    ldp    x0, x1, [x9]
> +    ldp    x2, x3, [x9, #16]
> +    ldp    x4, x5, [x9, #32]
> +    ldp    x6, x7, [x9, #48]
> +
> +    blr    x8
> +
> +    mov    sp, x29
> +    ldp    x29, x30, [sp], #16
> +    ret
> 
>  //****************************************************************************
>  // EbcLLEbcInterpret
> --
> 2.7.4
> 
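
One detail I worked through for my own benefit: in the <= 64 byte path, each
8-byte argument slot is consumed by exactly one 4-byte ldr, so the computed
branch has to land Length / 2 bytes before the 0: label. A small standalone
sketch of that arithmetic (LabelZero, Entry and LdrsExecuted are invented
names, purely illustrative):

  #include <assert.h>
  #include <stdint.h>

  int
  main (void)
  {
    //
    // Hypothetical address of the "0: br x8" landing pad; only the distances
    // matter here, not the absolute value.
    //
    uint64_t  LabelZero = 0x1000;
    uint64_t  Length;

    for (Length = 0; Length <= 64; Length += 8) {
      //
      // adr x0, 0f ; sub x0, x0, x3, lsr #1 ; br x0
      //
      uint64_t  Entry        = LabelZero - (Length >> 1);
      uint64_t  LdrsExecuted = (LabelZero - Entry) / 4;  // 4 bytes per A64 insn

      //
      // Exactly one ldr per 8-byte argument slot runs before the "br x8".
      //
      assert (LdrsExecuted == Length / 8);
    }

    return 0;
  }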