* [PATCH v5 2/4] MdePkg/BaseMemoryLibOptDxe: add accelerated ARM routines
2016-09-09 14:00 [PATCH v5 0/4] MdePkg: add ARM/AARCH64 support to BaseMemoryLib Ard Biesheuvel
2016-09-09 14:00 ` [PATCH v5 1/4] MdePkg/BaseMemoryLib: widen aligned accesses to 32 or 64 bits Ard Biesheuvel
@ 2016-09-09 14:00 ` Ard Biesheuvel
2016-09-09 14:00 ` [PATCH v5 3/4] MdePkg/BaseMemoryLibOptDxe: add accelerated AARCH64 routines Ard Biesheuvel
2016-09-09 14:00 ` [PATCH v5 4/4] MdePkg/BaseMemoryLibOptDxe ARM|AARCH64: disallow use in SEC & PEI phases Ard Biesheuvel
3 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2016-09-09 14:00 UTC (permalink / raw)
To: edk2-devel, liming.gao, leif.lindholm, michael.d.kinney; +Cc: Ard Biesheuvel
This adds ARM support to BaseMemoryLibOptDxe, partially based on the
cortex-strings library (ScanMem) and the existing CopyMem() implementation
from BaseMemoryLibStm in ArmPkg.
All string routines are accelerated except ScanMem16, ScanMem32,
ScanMem64 and IsZeroBuffer, which can wait for another day. (Very few
occurrences exist in the codebase)
Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Liming Gao <liming.gao@intel.com>
---
MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S | 138 ++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm | 140 ++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S | 172 ++++++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm | 147 +++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S | 146 +++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm | 147 +++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c | 142 ++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S | 77 +++++++++
MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm | 84 ++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf | 30 ++--
10 files changed, 1209 insertions(+), 14 deletions(-)
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S
new file mode 100644
index 000000000000..951d15777a38
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S
@@ -0,0 +1,138 @@
+//
+// Copyright (c) 2013 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of the Linaro nor the
+// names of its contributors may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Parameters and result.
+#define src1 r0
+#define src2 r1
+#define limit r2
+#define result r0
+
+// Internal variables.
+#define data1 r3
+#define data2 r4
+#define limit_wd r5
+#define diff r6
+#define tmp1 r7
+#define tmp2 r12
+#define pos r8
+#define mask r14
+
+ .text
+ .thumb
+ .syntax unified
+ .align 5
+ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
+ASM_PFX(InternalMemCompareMem):
+ push {r4-r8, lr}
+ eor tmp1, src1, src2
+ tst tmp1, #3
+ bne .Lmisaligned4
+ ands tmp1, src1, #3
+ bne .Lmutual_align
+ add limit_wd, limit, #3
+ nop.w
+ lsr limit_wd, limit_wd, #2
+
+ // Start of performance-critical section -- one 32B cache line.
+.Lloop_aligned:
+ ldr data1, [src1], #4
+ ldr data2, [src2], #4
+.Lstart_realigned:
+ subs limit_wd, limit_wd, #1
+ eor diff, data1, data2 // Non-zero if differences found.
+ cbnz diff, 0f
+ bne .Lloop_aligned
+ // End of performance-critical section -- one 32B cache line.
+
+ // Not reached the limit, must have found a diff.
+0: cbnz limit_wd, .Lnot_limit
+
+ // Limit % 4 == 0 => all bytes significant.
+ ands limit, limit, #3
+ beq .Lnot_limit
+
+ lsl limit, limit, #3 // Bits -> bytes.
+ mov mask, #~0
+ lsl mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ orr diff, diff, mask
+
+.Lnot_limit:
+ rev diff, diff
+ rev data1, data1
+ rev data2, data2
+
+ // The MS-non-zero bit of DIFF marks either the first bit
+ // that is different, or the end of the significant data.
+ // Shifting left now will bring the critical information into the
+ // top bits.
+ clz pos, diff
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+
+ // But we need to zero-extend (char is unsigned) the value and then
+ // perform a signed 32-bit subtraction.
+ lsr data1, data1, #28
+ sub result, data1, data2, lsr #28
+ pop {r4-r8, pc}
+
+.Lmutual_align:
+ // Sources are mutually aligned, but are not currently at an
+ // alignment boundary. Round down the addresses and then mask off
+ // the bytes that precede the start point.
+ bic src1, src1, #3
+ bic src2, src2, #3
+ add limit, limit, tmp1 // Adjust the limit for the extra.
+ lsl tmp1, tmp1, #2 // Bytes beyond alignment -> bits.
+ ldr data1, [src1], #4
+ neg tmp1, tmp1 // Bits to alignment -32.
+ ldr data2, [src2], #4
+ mov tmp2, #~0
+
+ // Little-endian. Early bytes are at LSB.
+ lsr tmp2, tmp2, tmp1 // Shift (tmp1 & 31).
+ add limit_wd, limit, #3
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ lsr limit_wd, limit_wd, #2
+ b .Lstart_realigned
+
+.Lmisaligned4:
+ sub limit, limit, #1
+1:
+ // Perhaps we can do better than this.
+ ldrb data1, [src1], #1
+ ldrb data2, [src2], #1
+ subs limit, limit, #1
+ it cs
+ cmpcs data1, data2
+ beq 1b
+ sub result, data1, data2
+ pop {r4-r8, pc}
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm
new file mode 100644
index 000000000000..47b49ee16473
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm
@@ -0,0 +1,140 @@
+;
+; Copyright (c) 2013 - 2016, Linaro Limited
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+; * Neither the name of the Linaro nor the
+; names of its contributors may be used to endorse or promote products
+; derived from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+
+; Parameters and result.
+#define src1 r0
+#define src2 r1
+#define limit r2
+#define result r0
+
+; Internal variables.
+#define data1 r3
+#define data2 r4
+#define limit_wd r5
+#define diff r6
+#define tmp1 r7
+#define tmp2 r12
+#define pos r8
+#define mask r14
+
+ EXPORT InternalMemCompareMem
+ THUMB
+ AREA CompareMem, CODE, READONLY
+
+InternalMemCompareMem
+ push {r4-r8, lr}
+ eor tmp1, src1, src2
+ tst tmp1, #3
+ bne Lmisaligned4
+ ands tmp1, src1, #3
+ bne Lmutual_align
+ add limit_wd, limit, #3
+ nop.w
+ lsr limit_wd, limit_wd, #2
+
+ ; Start of performance-critical section -- one 32B cache line.
+Lloop_aligned
+ ldr data1, [src1], #4
+ ldr data2, [src2], #4
+Lstart_realigned
+ subs limit_wd, limit_wd, #1
+ eor diff, data1, data2 ; Non-zero if differences found.
+ cbnz diff, L0
+ bne Lloop_aligned
+ ; End of performance-critical section -- one 32B cache line.
+
+ ; Not reached the limit, must have found a diff.
+L0
+ cbnz limit_wd, Lnot_limit
+
+ // Limit % 4 == 0 => all bytes significant.
+ ands limit, limit, #3
+ beq Lnot_limit
+
+ lsl limit, limit, #3 // Bits -> bytes.
+ mov mask, #~0
+ lsl mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ orr diff, diff, mask
+
+Lnot_limit
+ rev diff, diff
+ rev data1, data1
+ rev data2, data2
+
+ ; The MS-non-zero bit of DIFF marks either the first bit
+ ; that is different, or the end of the significant data.
+ ; Shifting left now will bring the critical information into the
+ ; top bits.
+ clz pos, diff
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+
+ ; But we need to zero-extend (char is unsigned) the value and then
+ ; perform a signed 32-bit subtraction.
+ lsr data1, data1, #28
+ sub result, data1, data2, lsr #28
+ pop {r4-r8, pc}
+
+Lmutual_align
+ ; Sources are mutually aligned, but are not currently at an
+ ; alignment boundary. Round down the addresses and then mask off
+ ; the bytes that precede the start point.
+ bic src1, src1, #3
+ bic src2, src2, #3
+ add limit, limit, tmp1 ; Adjust the limit for the extra.
+ lsl tmp1, tmp1, #2 ; Bytes beyond alignment -> bits.
+ ldr data1, [src1], #4
+ neg tmp1, tmp1 ; Bits to alignment -32.
+ ldr data2, [src2], #4
+ mov tmp2, #~0
+
+ ; Little-endian. Early bytes are at LSB.
+ lsr tmp2, tmp2, tmp1 ; Shift (tmp1 & 31).
+ add limit_wd, limit, #3
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ lsr limit_wd, limit_wd, #2
+ b Lstart_realigned
+
+Lmisaligned4
+ sub limit, limit, #1
+L1
+ // Perhaps we can do better than this.
+ ldrb data1, [src1], #1
+ ldrb data2, [src2], #1
+ subs limit, limit, #1
+ it cs
+ cmpcs data1, data2
+ beq L1
+ sub result, data1, data2
+ pop {r4-r8, pc}
+
+ END
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S
new file mode 100644
index 000000000000..fb5293befc10
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S
@@ -0,0 +1,172 @@
+#------------------------------------------------------------------------------
+#
+# CopyMem() worker for ARM
+#
+# This file started out as C code that did 64 bit moves if the buffer was
+# 32-bit aligned, else it does a byte copy. It also does a byte copy for
+# any trailing bytes. It was updated to do 32-byte copies using stm/ldm.
+#
+# Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.<BR>
+# Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+# This program and the accompanying materials
+# are licensed and made available under the terms and conditions of the BSD License
+# which accompanies this distribution. The full text of the license may be found at
+# http://opensource.org/licenses/bsd-license.php
+#
+# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+#
+#------------------------------------------------------------------------------
+
+ .text
+ .thumb
+ .syntax unified
+
+/**
+ Copy Length bytes from Source to Destination. Overlap is OK.
+
+ This implementation
+
+ @param Destination Target of copy
+ @param Source Place to copy from
+ @param Length Number of bytes to copy
+
+ @return Destination
+
+
+VOID *
+EFIAPI
+InternalMemCopyMem (
+ OUT VOID *DestinationBuffer,
+ IN CONST VOID *SourceBuffer,
+ IN UINTN Length
+ )
+**/
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+ push {r4-r11, lr}
+ // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length)
+ mov r11, r0
+ mov r10, r0
+ mov r12, r2
+ mov r14, r1
+
+ cmp r11, r1
+ // If (dest < source)
+ bcc memcopy_check_optim_default
+
+ // If (source + length < dest)
+ rsb r3, r1, r11
+ cmp r12, r3
+ bcc memcopy_check_optim_default
+ b memcopy_check_optim_overlap
+
+memcopy_check_optim_default:
+ // Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1)
+ tst r0, #0xF
+ it ne
+ movne r0, #0
+ bne memcopy_default
+ tst r1, #0xF
+ ite ne
+ movne r3, #0
+ moveq r3, #1
+ cmp r2, #31
+ ite ls
+ movls r0, #0
+ andhi r0, r3, #1
+ b memcopy_default
+
+memcopy_check_optim_overlap:
+ // r10 = dest_end, r14 = source_end
+ add r10, r11, r12
+ add r14, r12, r1
+
+ // Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned)
+ cmp r2, #31
+ ite ls
+ movls r0, #0
+ movhi r0, #1
+ tst r10, #0xF
+ it ne
+ movne r0, #0
+ tst r14, #0xF
+ it ne
+ movne r0, #0
+ b memcopy_overlapped
+
+memcopy_overlapped_non_optim:
+ // We read 1 byte from the end of the source buffer
+ sub r3, r14, #1
+ sub r12, r12, #1
+ ldrb r3, [r3, #0]
+ sub r2, r10, #1
+ cmp r12, #0
+ // We write 1 byte at the end of the dest buffer
+ sub r10, r10, #1
+ sub r14, r14, #1
+ strb r3, [r2, #0]
+ bne memcopy_overlapped_non_optim
+ b memcopy_end
+
+// r10 = dest_end, r14 = source_end
+memcopy_overlapped:
+ // Are we in the optimized case ?
+ cmp r0, #0
+ beq memcopy_overlapped_non_optim
+
+ // Optimized Overlapped - Read 32 bytes
+ sub r14, r14, #32
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14, {r2-r9}
+
+ // If length is less than 32 then disable optim
+ it ls
+ movls r0, #0
+
+ cmp r12, #0
+
+ // Optimized Overlapped - Write 32 bytes
+ sub r10, r10, #32
+ stmia r10, {r2-r9}
+
+ // while (length != 0)
+ bne memcopy_overlapped
+ b memcopy_end
+
+memcopy_default_non_optim:
+ // Byte copy
+ ldrb r3, [r14], #1
+ sub r12, r12, #1
+ strb r3, [r10], #1
+
+memcopy_default:
+ cmp r12, #0
+ beq memcopy_end
+
+// r10 = dest, r14 = source
+memcopy_default_loop:
+ cmp r0, #0
+ beq memcopy_default_non_optim
+
+ // Optimized memcopy - Read 32 Bytes
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14!, {r2-r9}
+
+ // If length is less than 32 then disable optim
+ it ls
+ movls r0, #0
+
+ cmp r12, #0
+
+ // Optimized memcopy - Write 32 Bytes
+ stmia r10!, {r2-r9}
+
+ // while (length != 0)
+ bne memcopy_default_loop
+
+memcopy_end:
+ mov r0, r11
+ pop {r4-r11, pc}
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm
new file mode 100644
index 000000000000..2034807954d7
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm
@@ -0,0 +1,147 @@
+;------------------------------------------------------------------------------
+;
+; CopyMem() worker for ARM
+;
+; This file started out as C code that did 64 bit moves if the buffer was
+; 32-bit aligned, else it does a byte copy. It also does a byte copy for
+; any trailing bytes. It was updated to do 32-byte copies using stm/ldm.
+;
+; Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.<BR>
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+; This program and the accompanying materials
+; are licensed and made available under the terms and conditions of the BSD License
+; which accompanies this distribution. The full text of the license may be found at
+; http://opensource.org/licenses/bsd-license.php
+;
+; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+;
+;------------------------------------------------------------------------------
+
+ EXPORT InternalMemCopyMem
+ AREA SetMem, CODE, READONLY
+ THUMB
+
+InternalMemCopyMem
+ stmfd sp!, {r4-r11, lr}
+ // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length)
+ mov r11, r0
+ mov r10, r0
+ mov r12, r2
+ mov r14, r1
+
+memcopy_check_overlapped
+ cmp r11, r1
+ // If (dest < source)
+ bcc memcopy_check_optim_default
+
+ // If (source + length < dest)
+ rsb r3, r1, r11
+ cmp r12, r3
+ bcc memcopy_check_optim_default
+ b memcopy_check_optim_overlap
+
+memcopy_check_optim_default
+ // Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1)
+ tst r0, #0xF
+ movne r0, #0
+ bne memcopy_default
+ tst r1, #0xF
+ movne r3, #0
+ moveq r3, #1
+ cmp r2, #31
+ movls r0, #0
+ andhi r0, r3, #1
+ b memcopy_default
+
+memcopy_check_optim_overlap
+ // r10 = dest_end, r14 = source_end
+ add r10, r11, r12
+ add r14, r12, r1
+
+ // Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned)
+ cmp r2, #31
+ movls r0, #0
+ movhi r0, #1
+ tst r10, #0xF
+ movne r0, #0
+ tst r14, #0xF
+ movne r0, #0
+ b memcopy_overlapped
+
+memcopy_overlapped_non_optim
+ // We read 1 byte from the end of the source buffer
+ sub r3, r14, #1
+ sub r12, r12, #1
+ ldrb r3, [r3, #0]
+ sub r2, r10, #1
+ cmp r12, #0
+ // We write 1 byte at the end of the dest buffer
+ sub r10, r10, #1
+ sub r14, r14, #1
+ strb r3, [r2, #0]
+ bne memcopy_overlapped_non_optim
+ b memcopy_end
+
+// r10 = dest_end, r14 = source_end
+memcopy_overlapped
+ // Are we in the optimized case ?
+ cmp r0, #0
+ beq memcopy_overlapped_non_optim
+
+ // Optimized Overlapped - Read 32 bytes
+ sub r14, r14, #32
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14, {r2-r9}
+
+ // If length is less than 32 then disable optim
+ movls r0, #0
+
+ cmp r12, #0
+
+ // Optimized Overlapped - Write 32 bytes
+ sub r10, r10, #32
+ stmia r10, {r2-r9}
+
+ // while (length != 0)
+ bne memcopy_overlapped
+ b memcopy_end
+
+memcopy_default_non_optim
+ // Byte copy
+ ldrb r3, [r14], #1
+ sub r12, r12, #1
+ strb r3, [r10], #1
+
+memcopy_default
+ cmp r12, #0
+ beq memcopy_end
+
+// r10 = dest, r14 = source
+memcopy_default_loop
+ cmp r0, #0
+ beq memcopy_default_non_optim
+
+ // Optimized memcopy - Read 32 Bytes
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14!, {r2-r9}
+
+ // If length is less than 32 then disable optim
+ movls r0, #0
+
+ cmp r12, #0
+
+ // Optimized memcopy - Write 32 Bytes
+ stmia r10!, {r2-r9}
+
+ // while (length != 0)
+ bne memcopy_default_loop
+
+memcopy_end
+ mov r0, r11
+ ldmfd sp!, {r4-r11, pc}
+
+ END
+
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S
new file mode 100644
index 000000000000..dc0e74e8657c
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S
@@ -0,0 +1,146 @@
+// Copyright (c) 2010-2011, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of Linaro Limited nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Written by Dave Gilbert <david.gilbert@linaro.org>
+//
+// This memchr routine is optimised on a Cortex-A9 and should work on
+// all ARMv7 processors. It has a fast past for short sizes, and has
+// an optimised path for large data sets; the worst case is finding the
+// match early in a large data set.
+//
+
+
+// 2011-02-07 david.gilbert@linaro.org
+// Extracted from local git a5b438d861
+// 2011-07-14 david.gilbert@linaro.org
+// Import endianness fix from local git ea786f1b
+// 2011-12-07 david.gilbert@linaro.org
+// Removed unneeded cbz from align loop
+
+// this lets us check a flag in a 00/ff byte easily in either endianness
+#define CHARTSTMASK(c) 1<<(c*8)
+
+ .text
+ .thumb
+ .syntax unified
+
+ .type ASM_PFX(InternalMemScanMem8), %function
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+ // r0 = start of memory to scan
+ // r1 = length
+ // r2 = character to look for
+ // returns r0 = pointer to character or NULL if not found
+ uxtb r2, r2 // Don't think we can trust the caller to actually pass a char
+
+ cmp r1, #16 // If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 // If it's already aligned skip the next bit
+ beq 10f
+
+ // Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r1, r1, #1
+ cmp r3, r2
+ beq 50f // If it matches exit found
+ tst r0, #7
+ bne 5b // If not aligned yet then do next byte
+
+10:
+ // At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4-r7}
+ orr r2, r2, r2, lsl #8 // expand the match word across to all bytes
+ orr r2, r2, r2, lsl #16
+ bic r4, r1, #7 // Number of double words to work with
+ mvns r7, #0 // all F's
+ movs r3, #0
+
+15:
+ ldmia r0!, {r5,r6}
+ subs r4, r4, #8
+ eor r5, r5, r2 // Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6, r6, r2
+ uadd8 r5, r5, r7 // Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r5, r3, r7 // bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 // Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r6, r5, r7 // chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b // (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4-r7}
+ and r2, r2, #0xff // Get r2 back to a single character from the expansion above
+ and r1, r1, #7 // Leave the count remaining as the number after the double words have been done
+
+20:
+ cbz r1, 40f // 0 length or hit the end already then not found
+
+21: // Post aligned section, or just a short call
+ ldrb r3, [r0], #1
+ subs r1, r1, #1
+ eor r3, r3, r2 // r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b // on r1 flags
+
+40:
+ movs r0, #0 // not found
+ bx lr
+
+50:
+ subs r0, r0, #1 // found
+ bx lr
+
+60: // We're here because the fast path found a hit - now we have to track down exactly which word it was
+ // r0 points to the start of the double word after the one that was tested
+ // r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 // the end is in the 2nd word
+ subeq r0, r0, #3 // Points to 2nd byte of 2nd word
+ subne r0, r0, #7 // or 2nd byte of 1st word
+
+ // r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, #CHARTSTMASK(0) // 1st character
+ bne 61f
+ adds r0, r0, #1
+ tst r5, #CHARTSTMASK(1) // 2nd character
+ ittt eq
+ addeq r0, r0 ,#1
+ tsteq r5, #(3 << 15) // 2nd & 3rd character
+ // If not the 3rd must be the last one
+ addeq r0, r0, #1
+
+61:
+ pop {r4-r7}
+ subs r0, r0, #1
+ bx lr
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm
new file mode 100644
index 000000000000..abda87320e37
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm
@@ -0,0 +1,147 @@
+; Copyright (c) 2010-2011, Linaro Limited
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+;
+; * Neither the name of Linaro Limited nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+
+;
+; Written by Dave Gilbert <david.gilbert@linaro.org>
+;
+; This memchr routine is optimised on a Cortex-A9 and should work on
+; all ARMv7 processors. It has a fast past for short sizes, and has
+; an optimised path for large data sets; the worst case is finding the
+; match early in a large data set.
+;
+
+
+; 2011-02-07 david.gilbert@linaro.org
+; Extracted from local git a5b438d861
+; 2011-07-14 david.gilbert@linaro.org
+; Import endianness fix from local git ea786f1b
+; 2011-12-07 david.gilbert@linaro.org
+; Removed unneeded cbz from align loop
+
+; this lets us check a flag in a 00/ff byte easily in either endianness
+#define CHARTSTMASK(c) 1<<(c*8)
+
+ EXPORT InternalMemScanMem8
+ AREA ScanMem, CODE, READONLY
+ THUMB
+
+InternalMemScanMem8
+ ; r0 = start of memory to scan
+ ; r1 = length
+ ; r2 = character to look for
+ ; returns r0 = pointer to character or NULL if not found
+ uxtb r2, r2 ; Don't think we can trust the caller to actually pass a char
+
+ cmp r1, #16 ; If it's short don't bother with anything clever
+ blt L20
+
+ tst r0, #7 ; If it's already aligned skip the next bit
+ beq L10
+
+ ; Work up to an aligned point
+L5
+ ldrb r3, [r0],#1
+ subs r1, r1, #1
+ cmp r3, r2
+ beq L50 ; If it matches exit found
+ tst r0, #7
+ bne L5 ; If not aligned yet then do next byte
+
+L10
+ ; At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4-r7}
+ orr r2, r2, r2, lsl #8 ; expand the match word across to all bytes
+ orr r2, r2, r2, lsl #16
+ bic r4, r1, #7 ; Number of double words to work with
+ mvns r7, #0 ; all F's
+ movs r3, #0
+
+L15
+ ldmia r0!, {r5,r6}
+ subs r4, r4, #8
+ eor r5, r5, r2 ; Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6, r6, r2
+ uadd8 r5, r5, r7 ; Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r5, r3, r7 ; bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 ; Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r6, r5, r7 ; chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, L60
+ bne L15 ; (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4-r7}
+ and r2, r2, #0xff ; Get r2 back to a single character from the expansion above
+ and r1, r1, #7 ; Leave the count remaining as the number after the double words have been done
+
+L20
+ cbz r1, L40 ; 0 length or hit the end already then not found
+
+L21 ; Post aligned section, or just a short call
+ ldrb r3, [r0], #1
+ subs r1, r1, #1
+ eor r3, r3, r2 ; r3 = 0 if match - doesn't break flags from sub
+ cbz r3, L50
+ bne L21 ; on r1 flags
+
+L40
+ movs r0, #0 ; not found
+ bx lr
+
+L50
+ subs r0, r0, #1 ; found
+ bx lr
+
+L60 ; We're here because the fast path found a hit - now we have to track down exactly which word it was
+ ; r0 points to the start of the double word after the one that was tested
+ ; r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 ; the end is in the 2nd word
+ subeq r0, r0, #3 ; Points to 2nd byte of 2nd word
+ subne r0, r0, #7 ; or 2nd byte of 1st word
+
+ ; r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, #CHARTSTMASK(0) ; 1st character
+ bne L61
+ adds r0, r0, #1
+ tst r5, #CHARTSTMASK(1) ; 2nd character
+ ittt eq
+ addeq r0, r0 ,#1
+ tsteq r5, #(3 << 15) ; 2nd & 3rd character
+ ; If not the 3rd must be the last one
+ addeq r0, r0, #1
+
+L61
+ pop {r4-r7}
+ subs r0, r0, #1
+ bx lr
+
+ END
+
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c
new file mode 100644
index 000000000000..20fa7e9be697
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c
@@ -0,0 +1,142 @@
+/** @file
+ Architecture Independent Base Memory Library Implementation.
+
+ The following BaseMemoryLib instances contain the same copy of this file:
+ BaseMemoryLib
+ PeiMemoryLib
+ UefiMemoryLib
+
+ Copyright (c) 2006 - 2016, Intel Corporation. All rights reserved.<BR>
+ This program and the accompanying materials
+ are licensed and made available under the terms and conditions of the BSD License
+ which accompanies this distribution. The full text of the license may be found at
+ http://opensource.org/licenses/bsd-license.php.
+
+ THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+ WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+
+**/
+
+#include "../MemLibInternals.h"
+
+/**
+ Scans a target buffer for a 16-bit value, and returns a pointer to the
+ matching 16-bit value in the target buffer.
+
+ @param Buffer The pointer to the target buffer to scan.
+ @param Length The count of 16-bit value to scan. Must be non-zero.
+ @param Value The value to search for in the target buffer.
+
+ @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem16 (
+ IN CONST VOID *Buffer,
+ IN UINTN Length,
+ IN UINT16 Value
+ )
+{
+ CONST UINT16 *Pointer;
+
+ Pointer = (CONST UINT16*)Buffer;
+ do {
+ if (*Pointer == Value) {
+ return Pointer;
+ }
+ ++Pointer;
+ } while (--Length != 0);
+ return NULL;
+}
+
+/**
+ Scans a target buffer for a 32-bit value, and returns a pointer to the
+ matching 32-bit value in the target buffer.
+
+ @param Buffer The pointer to the target buffer to scan.
+ @param Length The count of 32-bit value to scan. Must be non-zero.
+ @param Value The value to search for in the target buffer.
+
+ @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem32 (
+ IN CONST VOID *Buffer,
+ IN UINTN Length,
+ IN UINT32 Value
+ )
+{
+ CONST UINT32 *Pointer;
+
+ Pointer = (CONST UINT32*)Buffer;
+ do {
+ if (*Pointer == Value) {
+ return Pointer;
+ }
+ ++Pointer;
+ } while (--Length != 0);
+ return NULL;
+}
+
+/**
+ Scans a target buffer for a 64-bit value, and returns a pointer to the
+ matching 64-bit value in the target buffer.
+
+ @param Buffer The pointer to the target buffer to scan.
+ @param Length The count of 64-bit value to scan. Must be non-zero.
+ @param Value The value to search for in the target buffer.
+
+ @return The pointer to the first occurrence, or NULL if not found.
+
+**/
+CONST VOID *
+EFIAPI
+InternalMemScanMem64 (
+ IN CONST VOID *Buffer,
+ IN UINTN Length,
+ IN UINT64 Value
+ )
+{
+ CONST UINT64 *Pointer;
+
+ Pointer = (CONST UINT64*)Buffer;
+ do {
+ if (*Pointer == Value) {
+ return Pointer;
+ }
+ ++Pointer;
+ } while (--Length != 0);
+ return NULL;
+}
+
+/**
+ Checks whether the contents of a buffer are all zeros.
+
+ @param Buffer The pointer to the buffer to be checked.
+ @param Length The size of the buffer (in bytes) to be checked.
+
+ @retval TRUE Contents of the buffer are all zeros.
+ @retval FALSE Contents of the buffer are not all zeros.
+
+**/
+BOOLEAN
+EFIAPI
+InternalMemIsZeroBuffer (
+ IN CONST VOID *Buffer,
+ IN UINTN Length
+ )
+{
+ CONST UINT8 *BufferData;
+ UINTN Index;
+
+ BufferData = Buffer;
+ for (Index = 0; Index < Length; Index++) {
+ if (BufferData[Index] != 0) {
+ return FALSE;
+ }
+ }
+ return TRUE;
+}
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S
new file mode 100644
index 000000000000..c1755539d36a
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S
@@ -0,0 +1,77 @@
+#------------------------------------------------------------------------------
+#
+# Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+#
+# This program and the accompanying materials are licensed and made available
+# under the terms and conditions of the BSD License which accompanies this
+# distribution. The full text of the license may be found at
+# http://opensource.org/licenses/bsd-license.php
+#
+# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+#
+#------------------------------------------------------------------------------
+
+ .text
+ .thumb
+ .syntax unified
+ .align 5
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+ movs r2, #0
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+ uxtb r2, r2
+ orr r2, r2, r2, lsl #8
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+ uxth r2, r2
+ orr r2, r2, r2, lsl #16
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+ mov r3, r2
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+ push {r4, lr}
+ cmp r1, #16 // fewer than 16 bytes of input?
+ add r1, r1, r0 // r1 := dst + length
+ add lr, r0, #16
+ blt 2f
+ bic lr, lr, #15 // align output pointer
+
+ str r2, [r0] // potentially unaligned store of 4 bytes
+ str r3, [r0, #4] // potentially unaligned store of 4 bytes
+ str r2, [r0, #8] // potentially unaligned store of 4 bytes
+ str r3, [r0, #12] // potentially unaligned store of 4 bytes
+ beq 1f
+
+0: add lr, lr, #16 // advance the output pointer by 16 bytes
+ subs r4, r1, lr // past the output?
+ blt 3f // break out of the loop
+ strd r2, r3, [lr, #-16] // aligned store of 16 bytes
+ strd r2, r3, [lr, #-8]
+ bne 0b // goto beginning of loop
+1: pop {r4, pc}
+
+2: subs r4, r1, lr
+3: adds r4, r4, #16
+ subs r1, r1, #8
+ cmp r4, #4 // between 4 and 15 bytes?
+ blt 4f
+ cmp r4, #8 // between 8 and 15 bytes?
+ str r2, [lr, #-16] // overlapping store of 4 + (4 + 4) + 4 bytes
+ itt gt
+ strgt r3, [lr, #-12]
+ strgt r2, [r1]
+ str r3, [r1, #4]
+ pop {r4, pc}
+
+4: cmp r4, #2 // 2 or 3 bytes?
+ strb r2, [lr, #-16] // store 1 byte
+ it ge
+ strhge r2, [r1, #6] // store 2 bytes
+ pop {r4, pc}
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
new file mode 100644
index 000000000000..2a8dc7d019f4
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
@@ -0,0 +1,84 @@
+;------------------------------------------------------------------------------
+;
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>
+;
+; This program and the accompanying materials are licensed and made available
+; under the terms and conditions of the BSD License which accompanies this
+; distribution. The full text of the license may be found at
+; http://opensource.org/licenses/bsd-license.php
+;
+; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+;
+;------------------------------------------------------------------------------
+
+ EXPORT InternalMemZeroMem
+ EXPORT InternalMemSetMem
+ EXPORT InternalMemSetMem16
+ EXPORT InternalMemSetMem32
+ EXPORT InternalMemSetMem64
+
+ AREA SetMem, CODE, READONLY, CODEALIGN, ALIGN=5
+ THUMB
+
+InternalMemZeroMem
+ movs r2, #0
+
+InternalMemSetMem
+ uxtb r2, r2
+ orr r2, r2, r2, lsl #8
+
+InternalMemSetMem16
+ uxth r2, r2
+ orr r2, r2, r2, lsr #16
+
+InternalMemSetMem32
+ mov r3, r2
+
+InternalMemSetMem64
+ push {r4, lr}
+ cmp r1, #16 ; fewer than 16 bytes of input?
+ add r1, r1, r0 ; r1 := dst + length
+ add lr, r0, #16
+ blt L2
+ bic lr, lr, #15 ; align output pointer
+
+ str r2, [r0] ; potentially unaligned store of 4 bytes
+ str r3, [r0, #4] ; potentially unaligned store of 4 bytes
+ str r2, [r0, #8] ; potentially unaligned store of 4 bytes
+ str r3, [r0, #12] ; potentially unaligned store of 4 bytes
+ beq L1
+
+L0
+ add lr, lr, #16 ; advance the output pointer by 16 bytes
+ subs r4, r1, lr ; past the output?
+ blt L3 ; break out of the loop
+ strd r2, r3, [lr, #-16] ; aligned store of 16 bytes
+ strd r2, r3, [lr, #-8]
+ bne L0 ; goto beginning of loop
+L1
+ pop {r4, pc}
+
+L2
+ subs r4, r1, lr
+L3
+ adds r4, r4, #16
+ subs r1, r1, #8
+ cmp r4, #4 ; between 4 and 15 bytes?
+ blt L4
+ cmp r4, #8 ; between 8 and 15 bytes?
+ str r2, [lr, #-16] ; overlapping store of 4 + (4 + 4) + 4 bytes
+ itt gt
+ strgt r3, [lr, #-12]
+ strgt r2, [r1]
+ str r3, [r1, #4]
+ pop {r4, pc}
+
+L4
+ cmp r4, #2 ; 2 or 3 bytes?
+ strb r2, [lr, #-16] ; store 1 byte
+ it ge
+ strhge r2, [r1, #6] ; store 2 bytes
+ pop {r4, pc}
+
+ END
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
index 71691b9859e3..d95eb599ea9e 100644
--- a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
@@ -27,7 +27,7 @@ [Defines]
#
-# VALID_ARCHITECTURES = IA32 X64
+# VALID_ARCHITECTURES = IA32 X64 ARM
#
[Sources]
@@ -79,19 +79,6 @@ [Sources.Ia32]
Ia32/CopyMem.nasm
Ia32/CopyMem.asm
Ia32/IsZeroBuffer.nasm
- ScanMem64Wrapper.c
- ScanMem32Wrapper.c
- ScanMem16Wrapper.c
- ScanMem8Wrapper.c
- ZeroMemWrapper.c
- CompareMemWrapper.c
- SetMem64Wrapper.c
- SetMem32Wrapper.c
- SetMem16Wrapper.c
- SetMemWrapper.c
- CopyMemWrapper.c
- IsZeroBufferWrapper.c
- MemLibGuid.c
[Sources.X64]
X64/ScanMem64.nasm
@@ -128,6 +115,21 @@ [Sources.X64]
X64/CopyMem.asm
X64/CopyMem.S
X64/IsZeroBuffer.nasm
+
+[Sources.ARM]
+ Arm/ScanMem.S |GCC
+ Arm/SetMem.S |GCC
+ Arm/CopyMem.S |GCC
+ Arm/CompareMem.S |GCC
+
+ Arm/ScanMem.asm |RVCT
+ Arm/SetMem.asm |RVCT
+ Arm/CopyMem.asm |RVCT
+ Arm/CompareMem.asm |RVCT
+
+ Arm/ScanMemGeneric.c
+
+[Sources]
ScanMem64Wrapper.c
ScanMem32Wrapper.c
ScanMem16Wrapper.c
--
2.7.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v5 3/4] MdePkg/BaseMemoryLibOptDxe: add accelerated AARCH64 routines
2016-09-09 14:00 [PATCH v5 0/4] MdePkg: add ARM/AARCH64 support to BaseMemoryLib Ard Biesheuvel
2016-09-09 14:00 ` [PATCH v5 1/4] MdePkg/BaseMemoryLib: widen aligned accesses to 32 or 64 bits Ard Biesheuvel
2016-09-09 14:00 ` [PATCH v5 2/4] MdePkg/BaseMemoryLibOptDxe: add accelerated ARM routines Ard Biesheuvel
@ 2016-09-09 14:00 ` Ard Biesheuvel
2016-09-09 14:00 ` [PATCH v5 4/4] MdePkg/BaseMemoryLibOptDxe ARM|AARCH64: disallow use in SEC & PEI phases Ard Biesheuvel
3 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2016-09-09 14:00 UTC (permalink / raw)
To: edk2-devel, liming.gao, leif.lindholm, michael.d.kinney; +Cc: Ard Biesheuvel
This adds AARCH64 support to BaseMemoryLibOptDxe, based on the cortex-strings
library. All string routines are accelerated except ScanMem16, ScanMem32,
ScanMem64 and IsZeroBuffer, which can wait for another day. (Very few
occurrences exist in the codebase)
Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Liming Gao <liming.gao@intel.com>
---
MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S | 142 ++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S | 284 ++++++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S | 161 +++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S | 244 +++++++++++++++++
MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf | 9 +-
5 files changed, 839 insertions(+), 1 deletion(-)
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S
new file mode 100644
index 000000000000..a54de6948be1
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S
@@ -0,0 +1,142 @@
+//
+// Copyright (c) 2013, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of the Linaro nor the
+// names of its contributors may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+//
+
+
+// Parameters and result.
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+// Internal variables.
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define diff x6
+#define endloop x7
+#define tmp1 x8
+#define tmp2 x9
+#define pos x11
+#define limit_wd x12
+#define mask x13
+
+ .p2align 6
+ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
+ASM_PFX(InternalMemCompareMem):
+ eor tmp1, src1, src2
+ tst tmp1, #7
+ b.ne .Lmisaligned8
+ ands tmp1, src1, #7
+ b.ne .Lmutual_align
+ add limit_wd, limit, #7
+ lsr limit_wd, limit_wd, #3
+
+ // Start of performance-critical section -- one 64B cache line.
+.Lloop_aligned:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned:
+ subs limit_wd, limit_wd, #1
+ eor diff, data1, data2 // Non-zero if differences found.
+ csinv endloop, diff, xzr, ne // Last Dword or differences.
+ cbz endloop, .Lloop_aligned
+ // End of performance-critical section -- one 64B cache line.
+
+ // Not reached the limit, must have found a diff.
+ cbnz limit_wd, .Lnot_limit
+
+ // Limit % 8 == 0 => all bytes significant.
+ ands limit, limit, #7
+ b.eq .Lnot_limit
+
+ lsl limit, limit, #3 // Bits -> bytes.
+ mov mask, #~0
+ lsl mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ orr diff, diff, mask
+
+.Lnot_limit:
+ rev diff, diff
+ rev data1, data1
+ rev data2, data2
+
+ // The MS-non-zero bit of DIFF marks either the first bit
+ // that is different, or the end of the significant data.
+ // Shifting left now will bring the critical information into the
+ // top bits.
+ clz pos, diff
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+
+ // But we need to zero-extend (char is unsigned) the value and then
+ // perform a signed 32-bit subtraction.
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+
+.Lmutual_align:
+ // Sources are mutually aligned, but are not currently at an
+ // alignment boundary. Round down the addresses and then mask off
+ // the bytes that precede the start point.
+ bic src1, src1, #7
+ bic src2, src2, #7
+ add limit, limit, tmp1 // Adjust the limit for the extra.
+ lsl tmp1, tmp1, #3 // Bytes beyond alignment -> bits.
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 // Bits to alignment -64.
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+
+ // Little-endian. Early bytes are at LSB.
+ lsr tmp2, tmp2, tmp1 // Shift (tmp1 & 63).
+ add limit_wd, limit, #7
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ lsr limit_wd, limit_wd, #3
+ b .Lstart_realigned
+
+ .p2align 6
+.Lmisaligned8:
+ sub limit, limit, #1
+1:
+ // Perhaps we can do better than this.
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, data2w, #0, cs // NZCV = 0b0000.
+ b.eq 1b
+ sub result, data1, data2
+ ret
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
new file mode 100644
index 000000000000..10b55b065c47
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
@@ -0,0 +1,284 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of the Linaro nor the
+// names of its contributors may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses.
+//
+//
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l srcend
+#define F_h dst
+#define tmp1 x9
+#define tmp2 x3
+
+#define L(l) .L ## l
+
+// Copies are split into 3 main cases: small copies of up to 16 bytes,
+// medium copies of 17..96 bytes which are fully unrolled. Large copies
+// of more than 96 bytes align the destination and use an unrolled loop
+// processing 64 bytes per iteration.
+// Small and medium copies read all data before writing, allowing any
+// kind of overlap, and memmove tailcalls memcpy for these cases as
+// well as non-overlapping copies.
+
+__memcpy:
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ // Medium copies: 17..96 bytes.
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ // Small copies: 0..16 bytes.
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ // Copy 0..3 bytes. Use a branchless sequence that copies the same
+ // byte 3 times if count==1, or the 2nd byte twice if count==2.
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ // Copy 64..96 bytes. Copy 64 bytes from the start and
+ // 32 bytes from the end.
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ // Align DST to 16 byte alignment so that we don't cross cache line
+ // boundaries on both loads and stores. There are at least 96 bytes
+ // to copy, so copy 16 bytes unaligned and then align. The loop
+ // copies 64 bytes per iteration and prefetches one iteration ahead.
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 // Count is now 16 too large.
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 // Test and readjust count.
+ b.ls 2f
+1:
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi 1b
+
+ // Write the last full set of 64 bytes. The remainder is at most 64
+ // bytes, so it is safe to always copy 64 bytes from the end even if
+ // there is just 1 byte left.
+2:
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+
+//
+// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+// Larger backwards copies are also handled by memcpy. The only remaining
+// case is forward large copies. The destination is aligned, and an
+// unrolled loop processes 64 bytes per iteration.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+ sub tmp2, dstin, src
+ cmp count, 96
+ ccmp tmp2, count, 2, hi
+ b.hs __memcpy
+
+ cbz tmp2, 3f
+ add dstend, dstin, count
+ add srcend, src, count
+
+ // Align dstend to 16 byte alignment so that we don't cross cache line
+ // boundaries on both loads and stores. There are at least 96 bytes
+ // to copy, so copy 16 bytes unaligned and then align. The loop
+ // copies 64 bytes per iteration and prefetches one iteration ahead.
+
+ and tmp2, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp2
+ sub count, count, tmp2
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp2
+ subs count, count, 128
+ b.ls 2f
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ // Write the last full set of 64 bytes. The remainder is at most 64
+ // bytes, so it is safe to always copy 64 bytes from the start even if
+ // there is just 1 byte left.
+2:
+ ldp E_l, E_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp E_l, E_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
new file mode 100644
index 000000000000..08e1fbb17082
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
@@ -0,0 +1,161 @@
+//
+// Copyright (c) 2014, ARM Limited
+// All rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of the company nor the names of its contributors
+// may be used to endorse or promote products derived from this
+// software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+// Neon Available.
+//
+
+// Arguments and results.
+#define srcin x0
+#define cntin x1
+#define chrin w2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+//
+// Core algorithm:
+//
+// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+// per byte. For each tuple, bit 0 is set if the relevant byte matched the
+// requested character and bit 1 is not used (faster than using a 32bit
+// syndrome). Since the bits in the syndrome reflect exactly the order in which
+// things occur in the original string, counting trailing zeros allows to
+// identify exactly which byte has matched.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+ // Do not dereference srcin if no bytes to compare.
+ cbz cntin, .Lzero_length
+ //
+ // Magic constant 0x40100401 allows us to identify which lane matches
+ // the requested byte.
+ //
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ // Work with aligned 32-byte chunks
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq .Lloop
+
+ //
+ // Input string is not 32-byte aligned. We calculate the syndrome
+ // value for the aligned 32 bytes block containing the first bytes
+ // and mask the irrelevant part.
+ //
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend.16b, vend.16b, vend.16b // 128->64
+ mov synd, vend.d[0]
+ // Clear the soff*2 lower bits
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ // The first block can also be the last
+ b.ls .Lmasklast
+ // Have we found something already?
+ cbnz synd, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ // If we're out of data we finish regardless of the result
+ b.ls .Lend
+ // Use a fast check for the termination condition
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.d[0]
+ // We're not out of data, loop if we haven't found the character
+ cbz synd, .Lloop
+
+.Lend:
+ // Termination condition found, let's calculate the syndrome value
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend.16b, vend.16b, vend.16b // 128->64
+ mov synd, vend.d[0]
+ // Only do the clear for the last possible block
+ b.hi .Ltail
+
+.Lmasklast:
+ // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+.Ltail:
+ // Count the trailing zeros using bit reversing
+ rbit synd, synd
+ // Compensate the last post-increment
+ sub src, src, #32
+ // Check that we have found a character
+ cmp synd, #0
+ // And count the leading zeros
+ clz synd, synd
+ // Compute the potential result
+ add result, src, synd, lsr #1
+ // Select result or NULL
+ csel result, xzr, result, eq
+ ret
+
+.Lzero_length:
+ mov result, #0
+ ret
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
new file mode 100644
index 000000000000..7f361110d4fe
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
@@ -0,0 +1,244 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of the Linaro nor the
+// names of its contributors may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses
+//
+//
+
+#define dstin x0
+#define count x1
+#define val x2
+#define valw w2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+ dup v0.8H, valw
+ b 0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+ dup v0.4S, valw
+ b 0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+ dup v0.2D, val
+ b 0f
+
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+ movi v0.16B, #0
+ b 0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+ dup v0.16B, valw
+0: add dstend, dstin, count
+ mov val, v0.D[0]
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+
+ // Set 0..15 bytes.
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ // Set 17..96 bytes.
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ // Set 64..96 bytes. Write 64 bytes from the start and
+ // 32 bytes from the end.
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+ nop
+L(set_long):
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 256
+ ccmp val, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst // Count is 16 too large.
+ add dst, dst, 16
+ sub count, count, 64 + 16 // Adjust count and bias for loop.
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(try_zva):
+ mrs tmp1, dczid_el0
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 // ZVA size is 64 bytes.
+ b.ne L(zva_128)
+
+ // Write the first and last 64 byte aligned block using stp rather
+ // than using DC ZVA. This is faster on some cores.
+L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst // Count is now 128 too large.
+ sub count, count, 128+64+64 // Adjust count and bias for loop.
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 // ZVA size is 128 bytes.
+ b.ne L(zva_other)
+
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst // Count is now 128 too large.
+ sub count, count, 128+128 // Adjust count and bias for loop.
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
+ add tmp1, zva_len, 64 // Max alignment bytes written.
+ cmp count, tmp1
+ blo L(no_zva)
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst // Actual alignment bytes to write.
+ bic tmp1, tmp1, tmp2 // Aligned dc zva start address.
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 // Remaining bytes to write.
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ b L(tail64)
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
index d95eb599ea9e..64d11b09ef06 100644
--- a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
@@ -27,7 +27,7 @@ [Defines]
#
-# VALID_ARCHITECTURES = IA32 X64 ARM
+# VALID_ARCHITECTURES = IA32 X64 ARM AARCH64
#
[Sources]
@@ -127,6 +127,13 @@ [Sources.ARM]
Arm/CopyMem.asm |RVCT
Arm/CompareMem.asm |RVCT
+[Sources.AARCH64]
+ AArch64/ScanMem.S
+ AArch64/SetMem.S
+ AArch64/CopyMem.S
+ AArch64/CompareMem.S
+
+[Sources.ARM, Sources.AARCH64]
Arm/ScanMemGeneric.c
[Sources]
--
2.7.4
^ permalink raw reply related [flat|nested] 14+ messages in thread