From: "gaoliming" <gaoliming@byosoft.com.cn>
To: <devel@edk2.groups.io>, <jeremy.compostella@intel.com>
Subject: 回复: [edk2-devel] [PATCH] BaseMemoryLibSse2: Take advantage of write combining buffers
Date: Tue, 13 Oct 2020 11:25:25 +0800 [thread overview]
Message-ID: <006801d6a110$7e0d5540$7a27ffc0$@byosoft.com.cn> (raw)
In-Reply-To: <87v9fj6shx.fsf@jcompost-mobl.ch.intel.com>
This is a good enhancement. The change is good to me.
Reviewed-by: Liming Gao <gaoliming@byosoft.com.cn>
Thanks
Liming
> -----邮件原件-----
> 发件人: bounce+27952+66093+4905953+8761045@groups.io
> <bounce+27952+66093+4905953+8761045@groups.io> 代表 Compostella,
> Jeremy
> 发送时间: 2020年10月10日 4:43
> 收件人: devel@edk2.groups.io
> 主题: [edk2-devel] [PATCH] BaseMemoryLibSse2: Take advantage of write
> combining buffers
>
> The current SSE2 implementation of the ZeroMem(), SetMem(),
> SetMem16(), SetMem32 and SetMem64 functions is writing 16 bytes per 16
> bytes. It hurts the performances so bad that this is even slower than
> a simple 'rep stos' (4% slower) in regular DRAM.
>
> To take full advantages of the 'movntdq' instruction it is better to
> "queue" a total of 64 bytes in the write combining buffers. This
> patch implement such a change. Below is a table where I measured
> (with 'rdtsc') the time to write an entire 100MB RAM buffer. These
> functions operate almost two times faster.
>
> | Function | Arch | Untouched | 64 bytes | Result |
> |----------+------+-----------+----------+--------|
> | ZeroMem | Ia32 | 17765947 | 9136062 | 1.945x |
> | ZeroMem | X64 | 17525170 | 9233391 | 1.898x |
> | SetMem | Ia32 | 17522291 | 9137272 | 1.918x |
> | SetMem | X64 | 17949261 | 9176978 | 1.956x |
> | SetMem16 | Ia32 | 18219673 | 9372062 | 1.944x |
> | SetMem16 | X64 | 17523331 | 9275184 | 1.889x |
> | SetMem32 | Ia32 | 18495036 | 9273053 | 1.994x |
> | SetMem32 | X64 | 17368864 | 9285885 | 1.870x |
> | SetMem64 | Ia32 | 18564473 | 9241362 | 2.009x |
> | SetMem64 | X64 | 17506951 | 9280148 | 1.886x |
>
> Signed-off-by: Jeremy Compostella <jeremy.compostella@intel.com>
> ---
> .../BaseMemoryLibSse2/Ia32/SetMem.nasm | 11 ++++++----
> .../BaseMemoryLibSse2/Ia32/SetMem16.nasm | 11 ++++++----
> .../BaseMemoryLibSse2/Ia32/SetMem32.nasm | 9 ++++++---
> .../BaseMemoryLibSse2/Ia32/SetMem64.nasm | 20
> +++++++++++++++----
> .../BaseMemoryLibSse2/Ia32/ZeroMem.nasm | 11 ++++++----
> .../Library/BaseMemoryLibSse2/X64/SetMem.nasm | 9 ++++++---
> .../BaseMemoryLibSse2/X64/SetMem16.nasm | 11 ++++++----
> .../BaseMemoryLibSse2/X64/SetMem32.nasm | 9 ++++++---
> .../BaseMemoryLibSse2/X64/SetMem64.nasm | 19
> ++++++++++++++----
> .../BaseMemoryLibSse2/X64/ZeroMem.nasm | 13 +++++++-----
> 10 files changed, 85 insertions(+), 38 deletions(-)
>
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm
> index 24313cb4b3..a8744300c6 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm
> @@ -34,7 +34,7 @@ ASM_PFX(InternalMemSetMem):
> mov al, [esp + 16] ; al <- Value
> xor ecx, ecx
> sub ecx, edi
> - and ecx, 15 ; ecx + edi aligns on 16-byte
> boundary
> + and ecx, 63 ; ecx + edi aligns on 16-byte
> boundary
> jz .0
> cmp ecx, edx
> cmova ecx, edx
> @@ -42,8 +42,8 @@ ASM_PFX(InternalMemSetMem):
> rep stosb
> .0:
> mov ecx, edx
> - and edx, 15
> - shr ecx, 4 ; ecx <- # of DQwords to set
> + and edx, 63
> + shr ecx, 6 ; ecx <- # of DQwords to set
> jz @SetBytes
> mov ah, al ; ax <- Value | (Value << 8)
> add esp, -16
> @@ -53,7 +53,10 @@ ASM_PFX(InternalMemSetMem):
> movlhps xmm0, xmm0 ; xmm0 <- Value repeats
> 16 times
> .1:
> movntdq [edi], xmm0 ; edi should be 16-byte
> aligned
> - add edi, 16
> + movntdq [edi + 16], xmm0
> + movntdq [edi + 32], xmm0
> + movntdq [edi + 48], xmm0
> + add edi, 64
> loop .1
> mfence
> movdqu xmm0, [esp] ; restore xmm0
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm
> index 6e308b5594..d461ee086c 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm
> @@ -33,7 +33,7 @@ ASM_PFX(InternalMemSetMem16):
> mov edi, [esp + 8]
> xor ecx, ecx
> sub ecx, edi
> - and ecx, 15 ; ecx + edi aligns on 16-byte
> boundary
> + and ecx, 63 ; ecx + edi aligns on 16-byte
> boundary
> mov eax, [esp + 16]
> jz .0
> shr ecx, 1
> @@ -43,15 +43,18 @@ ASM_PFX(InternalMemSetMem16):
> rep stosw
> .0:
> mov ecx, edx
> - and edx, 7
> - shr ecx, 3
> + and edx, 31
> + shr ecx, 5
> jz @SetWords
> movd xmm0, eax
> pshuflw xmm0, xmm0, 0
> movlhps xmm0, xmm0
> .1:
> movntdq [edi], xmm0 ; edi should be 16-byte
> aligned
> - add edi, 16
> + movntdq [edi + 16], xmm0
> + movntdq [edi + 32], xmm0
> + movntdq [edi + 48], xmm0
> + add edi, 64
> loop .1
> mfence
> @SetWords:
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm
> index 2cfc8cb0dd..3ffdcd07d7 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm
> @@ -43,14 +43,17 @@ ASM_PFX(InternalMemSetMem32):
> rep stosd
> .0:
> mov ecx, edx
> - and edx, 3
> - shr ecx, 2
> + and edx, 15
> + shr ecx, 4
> jz @SetDwords
> movd xmm0, eax
> pshufd xmm0, xmm0, 0
> .1:
> movntdq [edi], xmm0
> - add edi, 16
> + movntdq [edi + 16], xmm0
> + movntdq [edi + 32], xmm0
> + movntdq [edi + 48], xmm0
> + add edi, 64
> loop .1
> mfence
> @SetDwords:
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm
> index e153495a68..cd000648ae 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm
> @@ -38,17 +38,29 @@ ASM_PFX(InternalMemSetMem64):
> add edx, 8
> dec ecx
> .0:
> - shr ecx, 1
> + push ebx
> + mov ebx, ecx
> + and ebx, 7
> + shr ecx, 3
> jz @SetQwords
> movlhps xmm0, xmm0
> .1:
> movntdq [edx], xmm0
> - lea edx, [edx + 16]
> + movntdq [edx + 16], xmm0
> + movntdq [edx + 32], xmm0
> + movntdq [edx + 48], xmm0
> + lea edx, [edx + 64]
> loop .1
> mfence
> @SetQwords:
> - jnc .2
> + test ebx, ebx
> + jz .3
> + mov ecx, ebx
> +.2
> movq qword [edx], xmm0
> -.2:
> + lea edx, [edx + 8]
> + loop .2
> +.3:
> + pop ebx
> ret
>
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm
> index cd34006f59..0e0828551b 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm
> @@ -33,7 +33,7 @@ ASM_PFX(InternalMemZeroMem):
> xor ecx, ecx
> sub ecx, edi
> xor eax, eax
> - and ecx, 15
> + and ecx, 63
> jz .0
> cmp ecx, edx
> cmova ecx, edx
> @@ -41,13 +41,16 @@ ASM_PFX(InternalMemZeroMem):
> rep stosb
> .0:
> mov ecx, edx
> - and edx, 15
> - shr ecx, 4
> + and edx, 63
> + shr ecx, 6
> jz @ZeroBytes
> pxor xmm0, xmm0
> .1:
> movntdq [edi], xmm0
> - add edi, 16
> + movntdq [edi + 16], xmm0
> + movntdq [edi + 32], xmm0
> + movntdq [edi + 48], xmm0
> + add edi, 64
> loop .1
> mfence
> @ZeroBytes:
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm
> index 5bd1c2262d..28b11ee586 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm
> @@ -42,8 +42,8 @@ ASM_PFX(InternalMemSetMem):
> rep stosb
> .0:
> mov rcx, rdx
> - and rdx, 15
> - shr rcx, 4
> + and rdx, 63
> + shr rcx, 6
> jz @SetBytes
> mov ah, al ; ax <- Value repeats twice
> movdqa [rsp + 0x10], xmm0 ; save xmm0
> @@ -52,7 +52,10 @@ ASM_PFX(InternalMemSetMem):
> movlhps xmm0, xmm0 ; xmm0 <- Value repeats
> 16 times
> .1:
> movntdq [rdi], xmm0 ; rdi should be 16-byte
> aligned
> - add rdi, 16
> + movntdq [rdi + 16], xmm0
> + movntdq [rdi + 32], xmm0
> + movntdq [rdi + 48], xmm0
> + add rdi, 64
> loop .1
> mfence
> movdqa xmm0, [rsp + 0x10] ; restore xmm0
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm
> index 90d159820a..375be19313 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm
> @@ -33,7 +33,7 @@ ASM_PFX(InternalMemSetMem16):
> mov r9, rdi
> xor rcx, rcx
> sub rcx, rdi
> - and rcx, 15
> + and rcx, 63
> mov rax, r8
> jz .0
> shr rcx, 1
> @@ -43,15 +43,18 @@ ASM_PFX(InternalMemSetMem16):
> rep stosw
> .0:
> mov rcx, rdx
> - and edx, 7
> - shr rcx, 3
> + and edx, 31
> + shr rcx, 5
> jz @SetWords
> movd xmm0, eax
> pshuflw xmm0, xmm0, 0
> movlhps xmm0, xmm0
> .1:
> movntdq [rdi], xmm0
> - add rdi, 16
> + movntdq [rdi + 16], xmm0
> + movntdq [rdi + 32], xmm0
> + movntdq [rdi + 48], xmm0
> + add rdi, 64
> loop .1
> mfence
> @SetWords:
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm
> index 928e086889..5d12beaa9a 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm
> @@ -43,14 +43,17 @@ ASM_PFX(InternalMemSetMem32):
> rep stosd
> .0:
> mov rcx, rdx
> - and edx, 3
> - shr rcx, 2
> + and edx, 15
> + shr rcx, 4
> jz @SetDwords
> movd xmm0, eax
> pshufd xmm0, xmm0, 0
> .1:
> movntdq [rdi], xmm0
> - add rdi, 16
> + movntdq [rdi + 16], xmm0
> + movntdq [rdi + 32], xmm0
> + movntdq [rdi + 48], xmm0
> + add rdi, 64
> loop .1
> mfence
> @SetDwords:
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm
> index d771810542..265983d5ad 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm
> @@ -37,17 +37,28 @@ ASM_PFX(InternalMemSetMem64):
> add rdx, 8
> dec rcx
> .0:
> - shr rcx, 1
> + push rbx
> + mov rbx, rcx
> + and rbx, 7
> + shr rcx, 3
> jz @SetQwords
> movlhps xmm0, xmm0
> .1:
> movntdq [rdx], xmm0
> - lea rdx, [rdx + 16]
> + movntdq [rdx + 16], xmm0
> + movntdq [rdx + 32], xmm0
> + movntdq [rdx + 48], xmm0
> + lea rdx, [rdx + 64]
> loop .1
> mfence
> @SetQwords:
> - jnc .2
> - mov [rdx], r8
> + push rdi
> + mov rcx, rbx
> + mov rax, r8
> + mov rdi, rdx
> + rep stosq
> + pop rdi
> .2:
> + pop rbx
> ret
>
> diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm
> b/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm
> index 5ddcae9ca5..21f504e3b7 100644
> --- a/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm
> +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm
> @@ -32,7 +32,7 @@ ASM_PFX(InternalMemZeroMem):
> xor rcx, rcx
> xor eax, eax
> sub rcx, rdi
> - and rcx, 15
> + and rcx, 63
> mov r8, rdi
> jz .0
> cmp rcx, rdx
> @@ -41,13 +41,16 @@ ASM_PFX(InternalMemZeroMem):
> rep stosb
> .0:
> mov rcx, rdx
> - and edx, 15
> - shr rcx, 4
> + and edx, 63
> + shr rcx, 6
> jz @ZeroBytes
> pxor xmm0, xmm0
> .1:
> - movntdq [rdi], xmm0 ; rdi should be 16-byte
> aligned
> - add rdi, 16
> + movntdq [rdi], xmm0
> + movntdq [rdi + 16], xmm0
> + movntdq [rdi + 32], xmm0
> + movntdq [rdi + 48], xmm0
> + add rdi, 64
> loop .1
> mfence
> @ZeroBytes:
> --
> 2.25.3
>
>
>
>
>
next prev parent reply other threads:[~2020-10-13 3:25 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-09 20:42 [PATCH] BaseMemoryLibSse2: Take advantage of write combining buffers Compostella, Jeremy
2020-10-13 3:25 ` gaoliming [this message]
2020-10-15 17:47 ` 回复: [edk2-devel] " Compostella, Jeremy
2020-10-16 0:59 ` 回复: " gaoliming
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-list from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='006801d6a110$7e0d5540$7a27ffc0$@byosoft.com.cn' \
--to=devel@edk2.groups.io \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox