From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <ard.biesheuvel@linaro.org>
Received: from mail-wm0-x22d.google.com (mail-wm0-x22d.google.com
 [IPv6:2a00:1450:400c:c09::22d])
 (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
 (No client certificate requested)
 by ml01.01.org (Postfix) with ESMTPS id 91D341A1E06
 for <edk2-devel@lists.01.org>; Fri,  2 Sep 2016 04:40:50 -0700 (PDT)
Received: by mail-wm0-x22d.google.com with SMTP id v143so26744368wmv.0
 for <edk2-devel@lists.01.org>; Fri, 02 Sep 2016 04:40:50 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linaro.org; s=google;
 h=from:to:cc:subject:date:message-id:in-reply-to:references;
 bh=/75CRNf8xZZ1v2e1MqVlUcZa6KDsKbWL+f6kjgTDCrg=;
 b=Ggz8WC8L+MwMKP66HI6xCPAlwE0RBOhG83VDRTXqNg6IvS0GdsOBUECfTWww2qBV1H
 Tox0waF2pLh/8ETQQQyXb8jhKIMBzjm5Q+ixbTmBaH/HkLh9L+cvGVSTXdNwoxBVSrW5
 AjCh46T+V2yn64meV7w2yogCb6y2gsPa26LMY=
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20130820;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references;
 bh=/75CRNf8xZZ1v2e1MqVlUcZa6KDsKbWL+f6kjgTDCrg=;
 b=f4wmBuGypWAydG7vS75rBpH8rOCnsfYY45vfOpjGHCxg5IWKoa/fAx8u8EpuSDG9Ha
 3/rK/d84toEBl8i6FjPUANM5xV0+cwEHCyNhZTOf+Xna0kGLBHOWYVZRhe8j1gQM7X0m
 EMM0cAKZk2w7O2dG8UafeZBxprSmV2N7/AhXmOFlqX0LLRfbhXUKC9thYSLHk+4Ovc+6
 4QnakbYzAT1Uzg6xU1QcHNbfBtMbvq6DcehYIYpzkfDlpYmZANRv2Yr07qiYG1Ic4IJR
 avgMkJLTIQL0h0QcUmom3vBbp4nsWfughtiPoq+SUl/+GCAU6HNRCksV4xAayWWzLzYv
 KUHQ==
X-Gm-Message-State: AE9vXwMXOdWDszfcgPB54y6p1vfxi6HKYjzpKXu4Ib+R09LU0g8tein0273ArcpDkgdP+65c
X-Received: by 10.28.26.200 with SMTP id a191mr2134294wma.27.1472816448767;
 Fri, 02 Sep 2016 04:40:48 -0700 (PDT)
Received: from localhost.localdomain ([160.165.63.90])
 by smtp.gmail.com with ESMTPSA id h7sm10088490wjd.17.2016.09.02.04.40.45
 (version=TLS1_2 cipher=ECDHE-RSA-AES128-SHA bits=128/128);
 Fri, 02 Sep 2016 04:40:48 -0700 (PDT)
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
To: edk2-devel@lists.01.org,
	liming.gao@intel.com
Cc: leif.lindholm@linaro.org, lersek@redhat.com, michael.d.kinney@intel.com,
 Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Fri,  2 Sep 2016 12:40:37 +0100
Message-Id: <1472816437-4525-3-git-send-email-ard.biesheuvel@linaro.org>
X-Mailer: git-send-email 2.7.4
In-Reply-To: <1472816437-4525-1-git-send-email-ard.biesheuvel@linaro.org>
References: <1472816437-4525-1-git-send-email-ard.biesheuvel@linaro.org>
Subject: [PATCH 2/2] MdePkg/BaseMemoryLibOptDxe: added accelerated AARCH64 routines
X-BeenThere: edk2-devel@lists.01.org
X-Mailman-Version: 2.1.21
Precedence: list
List-Id: EDK II Development  <edk2-devel.lists.01.org>
List-Unsubscribe: <https://lists.01.org/mailman/options/edk2-devel>,
 <mailto:edk2-devel-request@lists.01.org?subject=unsubscribe>
List-Archive: <http://lists.01.org/pipermail/edk2-devel/>
List-Post: <mailto:edk2-devel@lists.01.org>
List-Help: <mailto:edk2-devel-request@lists.01.org?subject=help>
List-Subscribe: <https://lists.01.org/mailman/listinfo/edk2-devel>,
 <mailto:edk2-devel-request@lists.01.org?subject=subscribe>
X-List-Received-Date: Fri, 02 Sep 2016 11:40:51 -0000

This adds AARCH64 support to BaseMemoryLibOptDxe, based on the cortex-strings
library. All string routines are accelerated except ScanMem16, ScanMem32,
ScanMem64 and IsZeroBuffer, which can wait for another day. (Very few
occurrences exist in the codebase)

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S    | 149 +++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S       | 283 ++++++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S       | 161 +++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S        | 244 +++++++++++++++++
 MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf |  24 +-
 5 files changed, 847 insertions(+), 14 deletions(-)

diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S
new file mode 100644
index 000000000000..e9a60263a902
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S
@@ -0,0 +1,149 @@
+//
+// Copyright (c) 2013, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the Linaro nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+//
+
+
+// Arguments (x0 = src1, x1 = src2, x2 = limit in bytes) and result (x0).
+#define src1      x0
+#define src2      x1
+#define limit     x2
+#define result    x0
+
+// Scratch registers. has_nul and tmp3 are defined but not referenced below.
+#define data1     x3
+#define data1w    w3
+#define data2     x4
+#define data2w    w4
+#define has_nul   x5
+#define diff      x6
+#define endloop   x7
+#define tmp1      x8
+#define tmp2      x9
+#define tmp3      x10
+#define pos       x11
+#define limit_wd  x12
+#define mask      x13
+
+    .p2align 6
+ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
+ASM_PFX(InternalMemCompareMem):
+    cbz     limit, .Lret0                 // Zero length: report equality.
+    eor     tmp1, src1, src2
+    tst     tmp1, #7                      // Do the low 3 address bits agree?
+    b.ne    .Lmisaligned8                 // No: use the bytewise loop.
+    ands    tmp1, src1, #7                // tmp1 = bytes past 8-byte alignment.
+    b.ne    .Lmutual_align
+    add     limit_wd, limit, #7
+    lsr     limit_wd, limit_wd, #3        // Dwords to compare, rounded up.
+
+    // Start of performance-critical section  -- one 64B cache line.
+.Lloop_aligned:
+    ldr     data1, [src1], #8
+    ldr     data2, [src2], #8
+.Lstart_realigned:
+    subs    limit_wd, limit_wd, #1
+    eor     diff, data1, data2        // Non-zero if differences found.
+    csinv   endloop, diff, xzr, ne    // Last Dword or differences.
+    cbz     endloop, .Lloop_aligned
+    // End of performance-critical section  -- one 64B cache line.
+
+    // Not reached the limit, must have found a diff.
+    cbnz    limit_wd, .Lnot_limit
+
+    // Limit % 8 == 0 => all bytes significant.
+    ands    limit, limit, #7
+    b.eq    .Lnot_limit
+
+    lsl     limit, limit, #3              // Bytes -> bits.
+    mov     mask, #~0
+    lsl     mask, mask, limit             // Ones above the significant bytes.
+    bic     data1, data1, mask            // Discard bytes beyond the limit.
+    bic     data2, data2, mask
+
+    orr     diff, diff, mask              // Mark the end of significant data.
+
+.Lnot_limit:
+    rev     diff, diff                    // Byte-reverse so that the earliest
+    rev     data1, data1                  // byte in memory ends up in the most
+    rev     data2, data2                  // significant position.
+
+    // The MS-non-zero bit of DIFF marks either the first bit
+    // that is different, or the end of the significant data.
+    // Shifting left now will bring the critical information into the
+    // top bits.
+    clz     pos, diff
+    lsl     data1, data1, pos
+    lsl     data2, data2, pos
+
+    // Zero-extend the top byte of each value (char is unsigned) and
+    // subtract them; the subtraction uses the full 64-bit registers.
+    lsr     data1, data1, #56
+    sub     result, data1, data2, lsr #56
+    ret
+
+.Lmutual_align:
+    // Sources are mutually aligned, but are not currently at an
+    // alignment boundary.  Round down the addresses and then mask off
+    // the bytes that precede the start point.
+    bic     src1, src1, #7
+    bic     src2, src2, #7
+    add     limit, limit, tmp1          // Adjust the limit for the extra.
+    lsl     tmp1, tmp1, #3              // Bytes beyond alignment -> bits.
+    ldr     data1, [src1], #8
+    neg     tmp1, tmp1                  // Bits to alignment -64.
+    ldr     data2, [src2], #8
+    mov     tmp2, #~0
+
+    // Little-endian.  Early bytes are at LSB.
+    lsr     tmp2, tmp2, tmp1            // Shift (tmp1 & 63).
+    add     limit_wd, limit, #7
+    orr     data1, data1, tmp2          // Set the preceding bytes to 0xff in
+    orr     data2, data2, tmp2          // both words so they compare equal.
+    lsr     limit_wd, limit_wd, #3
+    b       .Lstart_realigned
+
+.Lret0:
+    mov     result, #0
+    ret
+
+    .p2align 6
+.Lmisaligned8:
+    sub     limit, limit, #1            // Bias: SUBS below borrows on last byte.
+1:
+    // Bytewise loop.  Perhaps we can do better than this.
+    ldrb    data1w, [src1], #1
+    ldrb    data2w, [src2], #1
+    subs    limit, limit, #1            // C clear once the last byte is loaded.
+    ccmp    data1w, data2w, #0, cs      // Compare if C set, else NZCV = 0b0000.
+    b.eq    1b
+    sub     result, data1, data2
+    ret
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
new file mode 100644
index 000000000000..9d812de8807b
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
@@ -0,0 +1,283 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the Linaro nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+//    products derived from this software without specific prior written
+//    permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses.
+//
+//
+
+#define dstin     x0
+#define src       x1
+#define count     x2
+#define dst       x3
+#define srcend    x4
+#define dstend    x5
+#define A_l       x6
+#define A_lw      w6
+#define A_h       x7
+#define A_hw      w7
+#define B_l       x8
+#define B_lw      w8
+#define B_h       x9
+#define C_l       x10
+#define C_h       x11
+#define D_l       x12
+#define D_h       x13
+#define E_l       src
+#define E_h       count
+#define F_l       srcend
+#define F_h       dst
+#define tmp1      x9
+
+#define L(l) .L ## l
+
+// __memcpy: x0 = dstin, x1 = src, x2 = count in bytes; x0 is left unchanged.
+// Copies are split into 3 main cases: small copies of up to 16 bytes,
+// medium copies of 17..96 bytes which are fully unrolled, and large copies
+// of more than 96 bytes which align the destination and use an unrolled
+// loop processing 64 bytes per iteration. Small and medium copies read all
+// data before writing, allowing any kind of overlap; InternalMemCopyMem
+// branches here for those and for large copies safe in ascending order.
+
+__memcpy:
+    prfm    PLDL1KEEP, [src]
+    add     srcend, src, count
+    add     dstend, dstin, count
+    cmp     count, 16
+    b.ls    L(copy16)
+    cmp     count, 96
+    b.hi    L(copy_long)
+
+    // Medium copies: 17..96 bytes.
+    sub     tmp1, count, 1              // tmp1 shares x9 with B_h.
+    ldp     A_l, A_h, [src]
+    tbnz    tmp1, 6, L(copy96)          // count - 1 >= 64: 65..96 bytes.
+    ldp     D_l, D_h, [srcend, -16]
+    tbz     tmp1, 5, 1f                 // 17..32: first + last 16 bytes suffice.
+    ldp     B_l, B_h, [src, 16]
+    ldp     C_l, C_h, [srcend, -32]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstend, -32]
+1:
+    stp     A_l, A_h, [dstin]
+    stp     D_l, D_h, [dstend, -16]
+    ret
+
+    .p2align 4
+    // Small copies: 0..16 bytes.
+L(copy16):
+    cmp     count, 8
+    b.lo    1f
+    ldr     A_l, [src]                  // 8..16 bytes: two 8-byte accesses
+    ldr     A_h, [srcend, -8]           // that may overlap in the middle.
+    str     A_l, [dstin]
+    str     A_h, [dstend, -8]
+    ret
+    .p2align 4
+1:
+    tbz     count, 2, 1f
+    ldr     A_lw, [src]                 // 4..7 bytes: two 4-byte accesses
+    ldr     A_hw, [srcend, -4]          // that may overlap in the middle.
+    str     A_lw, [dstin]
+    str     A_hw, [dstend, -4]
+    ret
+
+    // Copy 0..3 bytes.  Use a branchless sequence that copies the same
+    // byte 3 times if count==1, or the 2nd byte twice if count==2.
+1:
+    cbz     count, 2f
+    lsr     tmp1, count, 1
+    ldrb    A_lw, [src]
+    ldrb    A_hw, [srcend, -1]
+    ldrb    B_lw, [src, tmp1]
+    strb    A_lw, [dstin]
+    strb    B_lw, [dstin, tmp1]
+    strb    A_hw, [dstend, -1]
+2:  ret
+
+    .p2align 4
+    // Copy 65..96 bytes.  Copy 64 bytes from the start and
+    // 32 bytes from the end.
+L(copy96):
+    ldp     B_l, B_h, [src, 16]
+    ldp     C_l, C_h, [src, 32]
+    ldp     D_l, D_h, [src, 48]
+    ldp     E_l, E_h, [srcend, -32]     // E_l/E_h reuse src/count: dead now.
+    ldp     F_l, F_h, [srcend, -16]     // F_l/F_h reuse srcend/dst.
+    stp     A_l, A_h, [dstin]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstin, 32]
+    stp     D_l, D_h, [dstin, 48]
+    stp     E_l, E_h, [dstend, -32]
+    stp     F_l, F_h, [dstend, -16]
+    ret
+
+    // Align DST to 16 byte alignment so that we don't cross cache line
+    // boundaries on both loads and stores. There are more than 96 bytes
+    // to copy, so copy 16 bytes unaligned and then align. The loop
+    // copies 64 bytes per iteration and loads one iteration ahead.
+
+    .p2align 4
+L(copy_long):
+    and     tmp1, dstin, 15
+    bic     dst, dstin, 15
+    ldp     D_l, D_h, [src]
+    sub     src, src, tmp1
+    add     count, count, tmp1      // Count is now 16 too large.
+    ldp     A_l, A_h, [src, 16]
+    stp     D_l, D_h, [dstin]
+    ldp     B_l, B_h, [src, 32]
+    ldp     C_l, C_h, [src, 48]
+    ldp     D_l, D_h, [src, 64]!
+    subs    count, count, 128 + 16  // Test and readjust count.
+    b.ls    2f
+1:
+    stp     A_l, A_h, [dst, 16]
+    ldp     A_l, A_h, [src, 16]
+    stp     B_l, B_h, [dst, 32]
+    ldp     B_l, B_h, [src, 32]
+    stp     C_l, C_h, [dst, 48]
+    ldp     C_l, C_h, [src, 48]
+    stp     D_l, D_h, [dst, 64]!
+    ldp     D_l, D_h, [src, 64]!
+    subs    count, count, 64
+    b.hi    1b
+
+    // Write the last full set of 64 bytes. The remainder is at most 64
+    // bytes, so it is safe to always copy 64 bytes from the end even if
+    // there is just 1 byte left.
+2:
+    ldp     E_l, E_h, [srcend, -64] // E_l/E_h reuse src/count: dead now.
+    stp     A_l, A_h, [dst, 16]
+    ldp     A_l, A_h, [srcend, -48]
+    stp     B_l, B_h, [dst, 32]
+    ldp     B_l, B_h, [srcend, -32]
+    stp     C_l, C_h, [dst, 48]
+    ldp     C_l, C_h, [srcend, -16]
+    stp     D_l, D_h, [dst, 64]
+    stp     E_l, E_h, [dstend, -64]
+    stp     A_l, A_h, [dstend, -48]
+    stp     B_l, B_h, [dstend, -32]
+    stp     C_l, C_h, [dstend, -16]
+    ret
+
+
+// InternalMemCopyMem: x0 = dstin, x1 = src, x2 = count; x0 is not modified.
+// Copies of up to 96 bytes are done by __memcpy as it supports overlaps,
+// as are larger copies where (dstin - src), taken as unsigned, is at least
+// count. The only remaining case is a large copy whose destination starts
+// inside the source buffer; it is copied from the end downwards in an
+// unrolled loop processing 64 bytes per iteration.
+
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+    sub     tmp1, dstin, src
+    cmp     count, 96
+    ccmp    tmp1, count, 2, hi      // count <= 96 forces C set, i.e. HS.
+    b.hs    __memcpy
+
+    cbz     tmp1, 3f                // dstin == src: nothing to copy.
+    add     dstend, dstin, count
+    add     srcend, src, count
+
+    // Align dstend to 16 byte alignment so that we don't cross cache line
+    // boundaries on both loads and stores. There are more than 96 bytes
+    // to copy, so copy 16 bytes unaligned and then align. The loop
+    // copies 64 bytes per iteration and loads one iteration ahead.
+
+    and     tmp1, dstend, 15
+    ldp     D_l, D_h, [srcend, -16]
+    sub     srcend, srcend, tmp1
+    sub     count, count, tmp1
+    ldp     A_l, A_h, [srcend, -16]
+    stp     D_l, D_h, [dstend, -16]
+    sub     dstend, dstend, tmp1    // Last use of tmp1: it shares x9 with B_h.
+    ldp     B_l, B_h, [srcend, -32]
+    ldp     C_l, C_h, [srcend, -48]
+    ldp     D_l, D_h, [srcend, -64]!
+    subs    count, count, 128
+    b.ls    2f
+    nop
+1:
+    stp     A_l, A_h, [dstend, -16]
+    ldp     A_l, A_h, [srcend, -16]
+    stp     B_l, B_h, [dstend, -32]
+    ldp     B_l, B_h, [srcend, -32]
+    stp     C_l, C_h, [dstend, -48]
+    ldp     C_l, C_h, [srcend, -48]
+    stp     D_l, D_h, [dstend, -64]!
+    ldp     D_l, D_h, [srcend, -64]!
+    subs    count, count, 64
+    b.hi    1b
+
+    // Write the last full set of 64 bytes. The remainder is at most 64
+    // bytes, so it is safe to always copy 64 bytes from the start even if
+    // there is just 1 byte left.
+2:
+    ldp     F_l, F_h, [src, 48]     // Not E_l/E_h: E_l is src, needed below.
+    stp     A_l, A_h, [dstend, -16]
+    ldp     A_l, A_h, [src, 32]
+    stp     B_l, B_h, [dstend, -32]
+    ldp     B_l, B_h, [src, 16]
+    stp     C_l, C_h, [dstend, -48]
+    ldp     C_l, C_h, [src]
+    stp     D_l, D_h, [dstend, -64]
+    stp     F_l, F_h, [dstin, 48]   // F_l/F_h reuse srcend/dst: dead by now.
+    stp     A_l, A_h, [dstin, 32]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstin]
+3:  ret
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
new file mode 100644
index 000000000000..e9029546d762
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
@@ -0,0 +1,161 @@
+//
+// Copyright (c) 2014, ARM Limited
+// All rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the company nor the names of its contributors
+//       may be used to endorse or promote products derived from this
+//       software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+// Neon Available.
+//
+
+// Arguments and results.
+#define srcin     x0
+#define cntin     x1
+#define chrin     w2
+
+#define result    x0
+
+#define src       x3
+#define	tmp       x4
+#define wtmp2     w5
+#define synd      x6
+#define soff      x9
+#define cntrem    x10
+
+#define vrepchr   v0
+#define vdata1    v1
+#define vdata2    v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask  v5
+#define vend      v6
+
+//
+// InternalMemScanMem8: x0 = srcin (buffer), x1 = cntin (length in bytes),
+// w2 = chrin (byte to find). Returns in x0 the address of the first match,
+// or 0 if the byte is not found or cntin is 0.
+//
+// Core algorithm: for each 32-byte chunk we calculate a 64-bit syndrome
+// value with two bits per byte; bit 0 of each pair is set if that byte
+// matched, bit 1 is unused (faster than a 32-bit syndrome). The syndrome
+// bits follow memory order, so counting trailing zeros identifies the match.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+    // Do not dereference srcin if no bytes to compare.
+    cbz     cntin, .Lzero_length
+    //
+    // Magic constant 0x40100401 gives each of four adjacent bytes its own
+    // even bit (0, 2, 4, 6) so that pairwise adds can merge them losslessly.
+    //
+    mov     wtmp2, #0x0401
+    movk    wtmp2, #0x4010, lsl #16
+    dup     vrepchr.16b, chrin
+    // Work with aligned 32-byte chunks
+    bic     src, srcin, #31
+    dup     vrepmask.4s, wtmp2
+    ands    soff, srcin, #31
+    and     cntrem, cntin, #31
+    b.eq    .Lloop
+
+    //
+    // Input string is not 32-byte aligned. We calculate the syndrome
+    // value for the aligned 32 bytes block containing the first bytes
+    // and mask the irrelevant part.
+    //
+
+    ld1     {vdata1.16b, vdata2.16b}, [src], #32
+    sub     tmp, soff, #32
+    adds    cntin, cntin, tmp           // Flags are consumed by b.ls below.
+    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
+    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
+    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b        // 256->128
+    addp    vend.16b, vend.16b, vend.16b                  // 128->64
+    mov     synd, vend.d[0]
+    // Clear the soff*2 lower bits
+    lsl     tmp, soff, #1
+    lsr     synd, synd, tmp
+    lsl     synd, synd, tmp
+    // The first block can also be the last
+    b.ls    .Lmasklast
+    // Have we found something already?
+    cbnz    synd, .Ltail
+
+.Lloop:
+    ld1     {vdata1.16b, vdata2.16b}, [src], #32
+    subs    cntin, cntin, #32           // Flags drive b.ls and b.hi below.
+    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
+    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
+    // If we're out of data we finish regardless of the result
+    b.ls    .Lend
+    // Use a fast check for the termination condition
+    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
+    addp    vend.2d, vend.2d, vend.2d
+    mov     synd, vend.d[0]
+    // We're not out of data, loop if we haven't found the character
+    cbz     synd, .Lloop
+
+.Lend:
+    // Termination condition found, let's calculate the syndrome value
+    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
+    addp    vend.16b, vend.16b, vend.16b                // 128->64
+    mov     synd, vend.d[0]
+    // Only do the clear for the last possible block
+    b.hi    .Ltail
+
+.Lmasklast:
+    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
+    add     tmp, cntrem, soff
+    and     tmp, tmp, #31
+    sub     tmp, tmp, #32
+    neg     tmp, tmp, lsl #1
+    lsl     synd, synd, tmp
+    lsr     synd, synd, tmp
+
+.Ltail:
+    // Count the trailing zeros using bit reversing
+    rbit    synd, synd
+    // Compensate the last post-increment
+    sub     src, src, #32
+    // Check that we have found a character
+    cmp     synd, #0
+    // And count the leading zeros
+    clz     synd, synd
+    // Compute the potential result
+    add     result, src, synd, lsr #1
+    // Select result or NULL
+    csel    result, xzr, result, eq
+    ret
+
+.Lzero_length:
+    mov     result, #0
+    ret
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
new file mode 100644
index 000000000000..7f361110d4fe
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
@@ -0,0 +1,244 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the Linaro nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+//    products derived from this software without specific prior written
+//    permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses
+//
+//
+
+#define dstin     x0
+#define count     x1
+#define val       x2
+#define valw      w2
+#define dst       x3
+#define dstend    x4
+#define tmp1      x5
+#define tmp1w     w5
+#define tmp2      x6
+#define tmp2w     w6
+#define zva_len   x7
+#define zva_lenw  w7
+
+#define L(l) .L ## l
+
+// x0 = Buffer, x1 = Length (in elements for SetMem16/32/64), x2 = Value.
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+    dup     v0.8H, valw
+    lsl     count, count, 1             // 16-bit elements -> bytes.
+    b       0f
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+    dup     v0.4S, valw
+    lsl     count, count, 2             // 32-bit elements -> bytes.
+    b       0f
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+    dup     v0.2D, val
+    lsl     count, count, 3             // 64-bit elements -> bytes.
+    b       0f
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+    movi    v0.16B, #0                  // Length is already in bytes.
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+    dup     v0.16B, valw                // Length is already in bytes.
+0:  add     dstend, dstin, count        // From here on count is in bytes.
+    mov     val, v0.D[0]                // val = low 64 bits of the pattern.
+    cmp     count, 96
+    b.hi    L(set_long)
+    cmp     count, 16
+    b.hs    L(set_medium)
+
+    // Set 0..15 bytes.
+    tbz     count, 3, 1f
+    str     val, [dstin]                // 8..15 bytes: two 8-byte stores
+    str     val, [dstend, -8]           // that may overlap in the middle.
+    ret
+    nop
+1:  tbz     count, 2, 2f
+    str     valw, [dstin]               // 4..7 bytes: two 4-byte stores.
+    str     valw, [dstend, -4]
+    ret
+2:  cbz     count, 3f
+    strb    valw, [dstin]               // 1..3 bytes.
+    tbz     count, 1, 3f
+    strh    valw, [dstend, -2]
+3:  ret
+
+    // Set 16..96 bytes.
+L(set_medium):
+    str     q0, [dstin]
+    tbnz    count, 6, L(set96)          // count >= 64: 64..96 bytes.
+    str     q0, [dstend, -16]
+    tbz     count, 5, 1f                // count < 32: the two stores suffice.
+    str     q0, [dstin, 16]
+    str     q0, [dstend, -32]
+1:  ret
+
+    .p2align 4
+    // Set 64..96 bytes.  Write 64 bytes from the start and
+    // 32 bytes from the end.
+L(set96):
+    str     q0, [dstin, 16]
+    stp     q0, q0, [dstin, 32]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+    nop
+L(set_long):
+    bic     dst, dstin, 15
+    str     q0, [dstin]
+    cmp     count, 256                  // Try DC ZVA only for count >= 256
+    ccmp    val, 0, 0, cs               // and a fill pattern of zero.
+    b.eq    L(try_zva)
+L(no_zva):
+    sub     count, dstend, dst        // Count is 16 too large.
+    add     dst, dst, 16
+    sub     count, count, 64 + 16     // Adjust count and bias for loop.
+1:  stp     q0, q0, [dst], 64
+    stp     q0, q0, [dst, -32]
+L(tail64):
+    subs    count, count, 64
+    b.hi    1b
+2:  stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+L(try_zva):
+    mrs     tmp1, dczid_el0
+    tbnz    tmp1w, 4, L(no_zva)         // DZP (bit 4) set: DC ZVA prohibited.
+    and     tmp1w, tmp1w, 15            // BS: log2 of the block size in words.
+    cmp     tmp1w, 4                  // ZVA size is 64 bytes.
+    b.ne    L(zva_128)
+
+    // Write the first and last 64 byte aligned block using stp rather
+    // than using DC ZVA.  This is faster on some cores.
+L(zva_64):
+    str     q0, [dst, 16]
+    stp     q0, q0, [dst, 32]
+    bic     dst, dst, 63
+    stp     q0, q0, [dst, 64]
+    stp     q0, q0, [dst, 96]
+    sub     count, dstend, dst         // Count is now 128 too large.
+    sub     count, count, 128+64+64    // Adjust count and bias for loop.
+    add     dst, dst, 128
+    nop
+1:  dc      zva, dst
+    add     dst, dst, 64
+    subs    count, count, 64
+    b.hi    1b
+    stp     q0, q0, [dst, 0]
+    stp     q0, q0, [dst, 32]
+    stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+L(zva_128):
+    cmp     tmp1w, 5                    // ZVA size is 128 bytes.
+    b.ne    L(zva_other)
+
+    str     q0, [dst, 16]
+    stp     q0, q0, [dst, 32]
+    stp     q0, q0, [dst, 64]
+    stp     q0, q0, [dst, 96]
+    bic     dst, dst, 127
+    sub     count, dstend, dst          // Count is now 128 too large.
+    sub     count, count, 128+128       // Adjust count and bias for loop.
+    add     dst, dst, 128
+1:  dc      zva, dst
+    add     dst, dst, 128
+    subs    count, count, 128
+    b.hi    1b
+    stp     q0, q0, [dstend, -128]
+    stp     q0, q0, [dstend, -96]
+    stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+L(zva_other):
+    mov     tmp2w, 4
+    lsl     zva_lenw, tmp2w, tmp1w      // zva_len = 4 << BS, in bytes.
+    add     tmp1, zva_len, 64           // Max alignment bytes written.
+    cmp     count, tmp1
+    blo     L(no_zva)
+
+    sub     tmp2, zva_len, 1
+    add     tmp1, dst, zva_len
+    add     dst, dst, 16
+    subs    count, tmp1, dst            // Actual alignment bytes to write.
+    bic     tmp1, tmp1, tmp2            // Aligned dc zva start address.
+    beq     2f
+1:  stp     q0, q0, [dst], 64
+    stp     q0, q0, [dst, -32]
+    subs    count, count, 64
+    b.hi    1b
+2:  mov     dst, tmp1
+    sub     count, dstend, tmp1         // Remaining bytes to write.
+    subs    count, count, zva_len
+    b.lo    4f
+3:  dc      zva, dst
+    add     dst, dst, zva_len
+    subs    count, count, zva_len
+    b.hs    3b
+4:  add     count, count, zva_len       // Undo the bias before the stp tail.
+    b       L(tail64)
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
index 71691b9859e3..4f69df0b0155 100644
--- a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
@@ -27,7 +27,7 @@ [Defines]
 
 
 #
-#  VALID_ARCHITECTURES           = IA32 X64
+#  VALID_ARCHITECTURES           = IA32 X64 AARCH64
 #
 
 [Sources]
@@ -79,19 +79,6 @@ [Sources.Ia32]
   Ia32/CopyMem.nasm
   Ia32/CopyMem.asm
   Ia32/IsZeroBuffer.nasm
-  ScanMem64Wrapper.c
-  ScanMem32Wrapper.c
-  ScanMem16Wrapper.c
-  ScanMem8Wrapper.c
-  ZeroMemWrapper.c
-  CompareMemWrapper.c
-  SetMem64Wrapper.c
-  SetMem32Wrapper.c
-  SetMem16Wrapper.c
-  SetMemWrapper.c
-  CopyMemWrapper.c
-  IsZeroBufferWrapper.c
-  MemLibGuid.c
 
 [Sources.X64]
   X64/ScanMem64.nasm
@@ -128,6 +115,15 @@ [Sources.X64]
   X64/CopyMem.asm
   X64/CopyMem.S
   X64/IsZeroBuffer.nasm
+
+[Sources.AARCH64]
+  AArch64/ScanMem.S
+  AArch64/ScanMemGeneric.c
+  AArch64/SetMem.S
+  AArch64/CopyMem.S
+  AArch64/CompareMem.S
+
+[Sources]
   ScanMem64Wrapper.c
   ScanMem32Wrapper.c
   ScanMem16Wrapper.c
-- 
2.7.4