* [PATCH v2 1/2] ArmVirtPkg/ArmPlatformLibQemu: Ensure that VFP is on before running C code
@ 2023-01-05 16:25 Ard Biesheuvel
2023-01-05 16:25 ` [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-05 16:25 UTC (permalink / raw)
To: devel; +Cc: dann.frazier, Ard Biesheuvel
Now that we build the early code without strict alignment and without
suppressing the use of SIMD registers, ensure that the VFP unit is on
before entering C code.
While at it, simplify the mov_i macro, which is only used for 32-bit
quantities.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
index 05ccc7f9f043..1787d52fbf51 100644
--- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
+++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
@@ -8,9 +8,7 @@
#include <AsmMacroIoLibV8.h>
.macro mov_i, reg:req, imm:req
- movz \reg, :abs_g3:\imm
- movk \reg, :abs_g2_nc:\imm
- movk \reg, :abs_g1_nc:\imm
+ movz \reg, :abs_g1:\imm
movk \reg, :abs_g0_nc:\imm
.endm
@@ -45,10 +43,9 @@
ASM_FUNC(ArmPlatformPeiBootAction)
mrs x0, CurrentEL // check current exception level
- tbz x0, #3, 0f // bail if above EL1
- ret
+ tbnz x0, #3, 0f // omit early ID map if above EL1
-0:mov_i x0, mairval
+ mov_i x0, mairval
mov_i x1, tcrval
adrp x2, idmap
orr x2, x2, #0xff << 48 // set non-zero ASID
@@ -87,7 +84,8 @@ ASM_FUNC(ArmPlatformPeiBootAction)
msr sctlr_el1, x3 // enable MMU and caches
isb
- ret
+
+0:b ArmEnableVFP // enable SIMD before entering C code
//UINTN
//ArmPlatformGetCorePosition (
--
2.39.0
^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-05 16:25 [PATCH v2 1/2] ArmVirtPkg/ArmPlatformLibQemu: Ensure that VFP is on before running C code Ard Biesheuvel
@ 2023-01-05 16:25 ` Ard Biesheuvel
2023-01-10 0:08 ` dann frazier
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-05 16:25 UTC (permalink / raw)
To: devel; +Cc: dann.frazier, Ard Biesheuvel
The early ID map used by ArmVirtQemu uses ASID scoped non-global
mappings, as this allows us to switch to the permanent ID map seamlessly
without the need for explicit TLB maintenance.
However, this triggers a known erratum on ThunderX, which does not
tolerate non-global mappings that are executable at EL1, as this appears
to result in I-cache corruption. (Linux disables the KPTI based Meltdown
mitigation on ThunderX for the same reason)
So work around this, by detecting the CPU implementor and part number,
and proceeding without the early ID map if a ThunderX CPU is detected.
Note that this requires the C code to be built with strict alignment
again, as we may end up executing it with the MMU and caches off.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
ArmVirtPkg/ArmVirtQemu.dsc | 5 +++++
ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S | 15 +++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/ArmVirtPkg/ArmVirtQemu.dsc b/ArmVirtPkg/ArmVirtQemu.dsc
index f77443229e8e..5dd8b6104cca 100644
--- a/ArmVirtPkg/ArmVirtQemu.dsc
+++ b/ArmVirtPkg/ArmVirtQemu.dsc
@@ -31,6 +31,7 @@ [Defines]
DEFINE SECURE_BOOT_ENABLE = FALSE
DEFINE TPM2_ENABLE = FALSE
DEFINE TPM2_CONFIG_ENABLE = FALSE
+ DEFINE CAVIUM_ERRATUM_27456 = FALSE
#
# Network definition
@@ -117,7 +118,11 @@ [LibraryClasses.common.UEFI_DRIVER]
UefiScsiLib|MdePkg/Library/UefiScsiLib/UefiScsiLib.inf
[BuildOptions]
+!if $(CAVIUM_ERRATUM_27456) == TRUE
+ GCC:*_*_AARCH64_PP_FLAGS = -DCAVIUM_ERRATUM_27456
+!else
GCC:*_*_AARCH64_CC_XIPFLAGS ==
+!endif
!include NetworkPkg/NetworkBuildOptions.dsc.inc
diff --git a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
index 1787d52fbf51..5ac7c732f6ec 100644
--- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
+++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
@@ -42,6 +42,21 @@
ASM_FUNC(ArmPlatformPeiBootAction)
+#ifdef CAVIUM_ERRATUM_27456
+ /*
+ * On Cavium ThunderX, using non-global mappings that are executable at EL1
+ * results in I-cache corruption. So just avoid the early ID mapping there.
+ *
+ * MIDR implementor 0x43
+ * MIDR part numbers 0xA1 0xA2 (but not 0xAF)
+ */
+ mrs x0, midr_el1 // read the MIDR into X0
+ ubfx x1, x0, #24, #8 // grab implementor id
+ ubfx x0, x0, #7, #9 // grab part number bits [11:3]
+ cmp x1, #0x43 // compare implementor id
+ ccmp x0, #0xA0 >> 3, #0, eq // compare part# bits [11:3]
+ b.eq 0f
+#endif
mrs x0, CurrentEL // check current exception level
tbnz x0, #3, 0f // omit early ID map if above EL1
--
2.39.0
^ permalink raw reply related [flat|nested] 37+ messages in thread
* Re: [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-05 16:25 ` [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX Ard Biesheuvel
@ 2023-01-10 0:08 ` dann frazier
2023-01-17 12:47 ` [edk2-devel] " Oliver Steffen
0 siblings, 1 reply; 37+ messages in thread
From: dann frazier @ 2023-01-10 0:08 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel
On Thu, Jan 05, 2023 at 05:25:28PM +0100, Ard Biesheuvel wrote:
> The early ID map used by ArmVirtQemu uses ASID scoped non-global
> mappings, as this allows us to switch to the permanent ID map seamlessly
> without the need for explicit TLB maintenance.
>
> However, this triggers a known erratum on ThunderX, which does not
> tolerate non-global mappings that are executable at EL1, as this appears
> to result in I-cache corruption. (Linux disables the KPTI based Meltdown
> mitigation on ThunderX for the same reason)
>
> So work around this, by detecting the CPU implementor and part number,
> and proceeding without the early ID map if a ThunderX CPU is detected.
>
> Note that this requires the C code to be built with strict alignment
> again, as we may end up executing it with the MMU and caches off.
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> ArmVirtPkg/ArmVirtQemu.dsc | 5 +++++
> ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S | 15 +++++++++++++++
> 2 files changed, 20 insertions(+)
FTR, this v2 series also worked for me.
-dann
> diff --git a/ArmVirtPkg/ArmVirtQemu.dsc b/ArmVirtPkg/ArmVirtQemu.dsc
> index f77443229e8e..5dd8b6104cca 100644
> --- a/ArmVirtPkg/ArmVirtQemu.dsc
> +++ b/ArmVirtPkg/ArmVirtQemu.dsc
> @@ -31,6 +31,7 @@ [Defines]
> DEFINE SECURE_BOOT_ENABLE = FALSE
> DEFINE TPM2_ENABLE = FALSE
> DEFINE TPM2_CONFIG_ENABLE = FALSE
> + DEFINE CAVIUM_ERRATUM_27456 = FALSE
>
> #
> # Network definition
> @@ -117,7 +118,11 @@ [LibraryClasses.common.UEFI_DRIVER]
> UefiScsiLib|MdePkg/Library/UefiScsiLib/UefiScsiLib.inf
>
> [BuildOptions]
> +!if $(CAVIUM_ERRATUM_27456) == TRUE
> + GCC:*_*_AARCH64_PP_FLAGS = -DCAVIUM_ERRATUM_27456
> +!else
> GCC:*_*_AARCH64_CC_XIPFLAGS ==
> +!endif
>
> !include NetworkPkg/NetworkBuildOptions.dsc.inc
>
> diff --git a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> index 1787d52fbf51..5ac7c732f6ec 100644
> --- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> +++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> @@ -42,6 +42,21 @@
>
>
> ASM_FUNC(ArmPlatformPeiBootAction)
> +#ifdef CAVIUM_ERRATUM_27456
> + /*
> + * On Cavium ThunderX, using non-global mappings that are executable at EL1
> + * results in I-cache corruption. So just avoid the early ID mapping there.
> + *
> + * MIDR implementor 0x43
> + * MIDR part numbers 0xA1 0xA2 (but not 0xAF)
> + */
> + mrs x0, midr_el1 // read the MIDR into X0
> + ubfx x1, x0, #24, #8 // grab implementor id
> + ubfx x0, x0, #7, #9 // grab part number bits [11:3]
> + cmp x1, #0x43 // compare implementor id
> + ccmp x0, #0xA0 >> 3, #0, eq // compare part# bits [11:3]
> + b.eq 0f
> +#endif
> mrs x0, CurrentEL // check current exception level
> tbnz x0, #3, 0f // omit early ID map if above EL1
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-10 0:08 ` dann frazier
@ 2023-01-17 12:47 ` Oliver Steffen
2023-01-17 14:53 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-01-17 12:47 UTC (permalink / raw)
To: devel, dann.frazier; +Cc: Ard Biesheuvel
[-- Attachment #1: Type: text/plain, Size: 4405 bytes --]
Hi Ard, Hi everyone,
Thanks for the work!
But somehow this patch (as it was merged into master branch) does not
work for me on the ThunderX box we have.
Any idea what could be wrong?
I enabled the erratum during build ;-)
CPU Info:
# lscpu
Architecture: aarch64
CPU op-mode(s): 64-bit
Byte Order: Little Endian
CPU(s): 224
On-line CPU(s) list: 0-223
Vendor ID: Cavium
BIOS Vendor ID: Cavium Inc.
Model name: ThunderX2 99xx
BIOS Model name: Cavium ThunderX2(R) CPU CN9975 v2.2 @ 2.0GHz
Model: 2
Thread(s) per core: 4
Core(s) per socket: 28
Socket(s): 2
Stepping: 0x1
BogoMIPS: 400.00
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics
cpuid asimdrdm
Caches (sum of all):
L1d: 1.8 MiB (56 instances)
L1i: 1.8 MiB (56 instances)
L2: 14 MiB (56 instances)
L3: 64 MiB (2 instances)
[...]
Thanks a lot!
- Oliver
On Tue, Jan 10, 2023 at 1:08 AM dann frazier <dann.frazier@canonical.com>
wrote:
> On Thu, Jan 05, 2023 at 05:25:28PM +0100, Ard Biesheuvel wrote:
> > The early ID map used by ArmVirtQemu uses ASID scoped non-global
> > mappings, as this allows us to switch to the permanent ID map seamlessly
> > without the need for explicit TLB maintenance.
> >
> > However, this triggers a known erratum on ThunderX, which does not
> > tolerate non-global mappings that are executable at EL1, as this appears
> > to result in I-cache corruption. (Linux disables the KPTI based Meltdown
> > mitigation on ThunderX for the same reason)
> >
> > So work around this, by detecting the CPU implementor and part number,
> > and proceeding without the early ID map if a ThunderX CPU is detected.
> >
> > Note that this requires the C code to be built with strict alignment
> > again, as we may end up executing it with the MMU and caches off.
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> > ArmVirtPkg/ArmVirtQemu.dsc | 5
> +++++
> > ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S | 15
> +++++++++++++++
> > 2 files changed, 20 insertions(+)
>
> FTR, this v2 series also worked for me.
>
> -dann
>
> > diff --git a/ArmVirtPkg/ArmVirtQemu.dsc b/ArmVirtPkg/ArmVirtQemu.dsc
> > index f77443229e8e..5dd8b6104cca 100644
> > --- a/ArmVirtPkg/ArmVirtQemu.dsc
> > +++ b/ArmVirtPkg/ArmVirtQemu.dsc
> > @@ -31,6 +31,7 @@ [Defines]
> > DEFINE SECURE_BOOT_ENABLE = FALSE
> > DEFINE TPM2_ENABLE = FALSE
> > DEFINE TPM2_CONFIG_ENABLE = FALSE
> > + DEFINE CAVIUM_ERRATUM_27456 = FALSE
> >
> > #
> > # Network definition
> > @@ -117,7 +118,11 @@ [LibraryClasses.common.UEFI_DRIVER]
> > UefiScsiLib|MdePkg/Library/UefiScsiLib/UefiScsiLib.inf
> >
> > [BuildOptions]
> > +!if $(CAVIUM_ERRATUM_27456) == TRUE
> > + GCC:*_*_AARCH64_PP_FLAGS = -DCAVIUM_ERRATUM_27456
> > +!else
> > GCC:*_*_AARCH64_CC_XIPFLAGS ==
> > +!endif
> >
> > !include NetworkPkg/NetworkBuildOptions.dsc.inc
> >
> > diff --git
> a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > index 1787d52fbf51..5ac7c732f6ec 100644
> > --- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > +++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > @@ -42,6 +42,21 @@
> >
> >
> > ASM_FUNC(ArmPlatformPeiBootAction)
> > +#ifdef CAVIUM_ERRATUM_27456
> > + /*
> > + * On Cavium ThunderX, using non-global mappings that are executable
> at EL1
> > + * results in I-cache corruption. So just avoid the early ID mapping
> there.
> > + *
> > + * MIDR implementor 0x43
> > + * MIDR part numbers 0xA1 0xA2 (but not 0xAF)
> > + */
> > + mrs x0, midr_el1 // read the MIDR into X0
> > + ubfx x1, x0, #24, #8 // grab implementor id
> > + ubfx x0, x0, #7, #9 // grab part number bits [11:3]
> > + cmp x1, #0x43 // compare implementor id
> > + ccmp x0, #0xA0 >> 3, #0, eq // compare part# bits [11:3]
> > + b.eq 0f
> > +#endif
> > mrs x0, CurrentEL // check current exception level
> > tbnz x0, #3, 0f // omit early ID map if above EL1
> >
>
>
>
>
>
>
[-- Attachment #2: Type: text/html, Size: 6463 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-17 12:47 ` [edk2-devel] " Oliver Steffen
@ 2023-01-17 14:53 ` Ard Biesheuvel
2023-01-18 6:36 ` Oliver Steffen
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-17 14:53 UTC (permalink / raw)
To: devel, osteffen; +Cc: dann.frazier
On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
>
> Hi Ard, Hi everyone,
>
> Thanks for the work!
>
> But somehow this patch (as it was merged into master branch) does not
> work for me on the ThunderX box we have.
>
> Any idea what could be wrong?
I'm not sure I understand the question. The patch targets ThunderX,
and you are using a ThunderX2.
What were you expecting to happen, and what is happening instead?
> I enabled the erratum during build ;-)
>
> CPU Info:
> # lscpu
> Architecture: aarch64
> CPU op-mode(s): 64-bit
> Byte Order: Little Endian
> CPU(s): 224
> On-line CPU(s) list: 0-223
> Vendor ID: Cavium
> BIOS Vendor ID: Cavium Inc.
> Model name: ThunderX2 99xx
> BIOS Model name: Cavium ThunderX2(R) CPU CN9975 v2.2 @ 2.0GHz
> Model: 2
> Thread(s) per core: 4
> Core(s) per socket: 28
> Socket(s): 2
> Stepping: 0x1
> BogoMIPS: 400.00
> Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm
> Caches (sum of all):
> L1d: 1.8 MiB (56 instances)
> L1i: 1.8 MiB (56 instances)
> L2: 14 MiB (56 instances)
> L3: 64 MiB (2 instances)
> [...]
>
> Thanks a lot!
> - Oliver
>
>
> On Tue, Jan 10, 2023 at 1:08 AM dann frazier <dann.frazier@canonical.com> wrote:
>>
>> On Thu, Jan 05, 2023 at 05:25:28PM +0100, Ard Biesheuvel wrote:
>> > The early ID map used by ArmVirtQemu uses ASID scoped non-global
>> > mappings, as this allows us to switch to the permanent ID map seamlessly
>> > without the need for explicit TLB maintenance.
>> >
>> > However, this triggers a known erratum on ThunderX, which does not
>> > tolerate non-global mappings that are executable at EL1, as this appears
>> > to result in I-cache corruption. (Linux disables the KPTI based Meltdown
>> > mitigation on ThunderX for the same reason)
>> >
>> > So work around this, by detecting the CPU implementor and part number,
>> > and proceeding without the early ID map if a ThunderX CPU is detected.
>> >
>> > Note that this requires the C code to be built with strict alignment
>> > again, as we may end up executing it with the MMU and caches off.
>> >
>> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
>> > ---
>> > ArmVirtPkg/ArmVirtQemu.dsc | 5 +++++
>> > ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S | 15 +++++++++++++++
>> > 2 files changed, 20 insertions(+)
>>
>> FTR, this v2 series also worked for me.
>>
>> -dann
>>
>> > diff --git a/ArmVirtPkg/ArmVirtQemu.dsc b/ArmVirtPkg/ArmVirtQemu.dsc
>> > index f77443229e8e..5dd8b6104cca 100644
>> > --- a/ArmVirtPkg/ArmVirtQemu.dsc
>> > +++ b/ArmVirtPkg/ArmVirtQemu.dsc
>> > @@ -31,6 +31,7 @@ [Defines]
>> > DEFINE SECURE_BOOT_ENABLE = FALSE
>> > DEFINE TPM2_ENABLE = FALSE
>> > DEFINE TPM2_CONFIG_ENABLE = FALSE
>> > + DEFINE CAVIUM_ERRATUM_27456 = FALSE
>> >
>> > #
>> > # Network definition
>> > @@ -117,7 +118,11 @@ [LibraryClasses.common.UEFI_DRIVER]
>> > UefiScsiLib|MdePkg/Library/UefiScsiLib/UefiScsiLib.inf
>> >
>> > [BuildOptions]
>> > +!if $(CAVIUM_ERRATUM_27456) == TRUE
>> > + GCC:*_*_AARCH64_PP_FLAGS = -DCAVIUM_ERRATUM_27456
>> > +!else
>> > GCC:*_*_AARCH64_CC_XIPFLAGS ==
>> > +!endif
>> >
>> > !include NetworkPkg/NetworkBuildOptions.dsc.inc
>> >
>> > diff --git a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
>> > index 1787d52fbf51..5ac7c732f6ec 100644
>> > --- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
>> > +++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
>> > @@ -42,6 +42,21 @@
>> >
>> >
>> > ASM_FUNC(ArmPlatformPeiBootAction)
>> > +#ifdef CAVIUM_ERRATUM_27456
>> > + /*
>> > + * On Cavium ThunderX, using non-global mappings that are executable at EL1
>> > + * results in I-cache corruption. So just avoid the early ID mapping there.
>> > + *
>> > + * MIDR implementor 0x43
>> > + * MIDR part numbers 0xA1 0xA2 (but not 0xAF)
>> > + */
>> > + mrs x0, midr_el1 // read the MIDR into X0
>> > + ubfx x1, x0, #24, #8 // grab implementor id
>> > + ubfx x0, x0, #7, #9 // grab part number bits [11:3]
>> > + cmp x1, #0x43 // compare implementor id
>> > + ccmp x0, #0xA0 >> 3, #0, eq // compare part# bits [11:3]
>> > + b.eq 0f
>> > +#endif
>> > mrs x0, CurrentEL // check current exception level
>> > tbnz x0, #3, 0f // omit early ID map if above EL1
>> >
>>
>>
>>
>>
>>
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-17 14:53 ` Ard Biesheuvel
@ 2023-01-18 6:36 ` Oliver Steffen
2023-01-18 7:34 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-01-18 6:36 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, dann.frazier
[-- Attachment #1: Type: text/plain, Size: 5508 bytes --]
On Tue, Jan 17, 2023 at 3:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > Hi Ard, Hi everyone,
> >
> > Thanks for the work!
> >
> > But somehow this patch (as it was merged into master branch) does not
> > work for me on the ThunderX box we have.
> >
> > Any idea what could be wrong?
>
> I'm not sure I understand the question. The patch targets ThunderX,
> and you are using a ThunderX2.
>
> What were you expecting to happen, and what is happening instead?
Firmware does not start at all when using KVM.
Please excuse my limited knowledge of Arm processor variants.
I assumed that ThunderX and ThunderX2 are very similar and hoped
the fix would also work for this case.
The issue was introduced by the same commit that Dann
reported (07be1d34d95460a238fcd0f6693efb747c28b329):
"ArmVirtPkg/ArmVirtQemu: enable initial ID map at early boot".
> I enabled the erratum during build ;-)
> >
> > CPU Info:
> > # lscpu
> > Architecture: aarch64
> > CPU op-mode(s): 64-bit
> > Byte Order: Little Endian
> > CPU(s): 224
> > On-line CPU(s) list: 0-223
> > Vendor ID: Cavium
> > BIOS Vendor ID: Cavium Inc.
> > Model name: ThunderX2 99xx
> > BIOS Model name: Cavium ThunderX2(R) CPU CN9975 v2.2 @ 2.0GHz
> > Model: 2
> > Thread(s) per core: 4
> > Core(s) per socket: 28
> > Socket(s): 2
> > Stepping: 0x1
> > BogoMIPS: 400.00
> > Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32
> atomics cpuid asimdrdm
> > Caches (sum of all):
> > L1d: 1.8 MiB (56 instances)
> > L1i: 1.8 MiB (56 instances)
> > L2: 14 MiB (56 instances)
> > L3: 64 MiB (2 instances)
> > [...]
> >
> > Thanks a lot!
> > - Oliver
> >
> >
> > On Tue, Jan 10, 2023 at 1:08 AM dann frazier <dann.frazier@canonical.com>
> wrote:
> >>
> >> On Thu, Jan 05, 2023 at 05:25:28PM +0100, Ard Biesheuvel wrote:
> >> > The early ID map used by ArmVirtQemu uses ASID scoped non-global
> >> > mappings, as this allows us to switch to the permanent ID map
> seamlessly
> >> > without the need for explicit TLB maintenance.
> >> >
> >> > However, this triggers a known erratum on ThunderX, which does not
> >> > tolerate non-global mappings that are executable at EL1, as this
> appears
> >> > to result in I-cache corruption. (Linux disables the KPTI based
> Meltdown
> >> > mitigation on ThunderX for the same reason)
> >> >
> >> > So work around this, by detecting the CPU implementor and part number,
> >> > and proceeding without the early ID map if a ThunderX CPU is detected.
> >> >
> >> > Note that this requires the C code to be built with strict alignment
> >> > again, as we may end up executing it with the MMU and caches off.
> >> >
> >> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> >> > ---
> >> > ArmVirtPkg/ArmVirtQemu.dsc |
> 5 +++++
> >> > ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S |
> 15 +++++++++++++++
> >> > 2 files changed, 20 insertions(+)
> >>
> >> FTR, this v2 series also worked for me.
> >>
> >> -dann
> >>
> >> > diff --git a/ArmVirtPkg/ArmVirtQemu.dsc b/ArmVirtPkg/ArmVirtQemu.dsc
> >> > index f77443229e8e..5dd8b6104cca 100644
> >> > --- a/ArmVirtPkg/ArmVirtQemu.dsc
> >> > +++ b/ArmVirtPkg/ArmVirtQemu.dsc
> >> > @@ -31,6 +31,7 @@ [Defines]
> >> > DEFINE SECURE_BOOT_ENABLE = FALSE
> >> > DEFINE TPM2_ENABLE = FALSE
> >> > DEFINE TPM2_CONFIG_ENABLE = FALSE
> >> > + DEFINE CAVIUM_ERRATUM_27456 = FALSE
> >> >
> >> > #
> >> > # Network definition
> >> > @@ -117,7 +118,11 @@ [LibraryClasses.common.UEFI_DRIVER]
> >> > UefiScsiLib|MdePkg/Library/UefiScsiLib/UefiScsiLib.inf
> >> >
> >> > [BuildOptions]
> >> > +!if $(CAVIUM_ERRATUM_27456) == TRUE
> >> > + GCC:*_*_AARCH64_PP_FLAGS = -DCAVIUM_ERRATUM_27456
> >> > +!else
> >> > GCC:*_*_AARCH64_CC_XIPFLAGS ==
> >> > +!endif
> >> >
> >> > !include NetworkPkg/NetworkBuildOptions.dsc.inc
> >> >
> >> > diff --git
> a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> >> > index 1787d52fbf51..5ac7c732f6ec 100644
> >> > ---
> a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> >> > +++
> b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> >> > @@ -42,6 +42,21 @@
> >> >
> >> >
> >> > ASM_FUNC(ArmPlatformPeiBootAction)
> >> > +#ifdef CAVIUM_ERRATUM_27456
> >> > + /*
> >> > + * On Cavium ThunderX, using non-global mappings that are
> executable at EL1
> >> > + * results in I-cache corruption. So just avoid the early ID
> mapping there.
> >> > + *
> >> > + * MIDR implementor 0x43
> >> > + * MIDR part numbers 0xA1 0xA2 (but not 0xAF)
> >> > + */
> >> > + mrs x0, midr_el1 // read the MIDR into X0
> >> > + ubfx x1, x0, #24, #8 // grab implementor id
> >> > + ubfx x0, x0, #7, #9 // grab part number bits [11:3]
> >> > + cmp x1, #0x43 // compare implementor id
> >> > + ccmp x0, #0xA0 >> 3, #0, eq // compare part# bits [11:3]
> >> > + b.eq 0f
> >> > +#endif
> >> > mrs x0, CurrentEL // check current exception level
> >> > tbnz x0, #3, 0f // omit early ID map if above EL1
> >> >
>
>
[-- Attachment #2: Type: text/html, Size: 8163 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-18 6:36 ` Oliver Steffen
@ 2023-01-18 7:34 ` Ard Biesheuvel
2023-01-18 8:27 ` Oliver Steffen
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-18 7:34 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, dann.frazier
On Wed, 18 Jan 2023 at 07:37, Oliver Steffen <osteffen@redhat.com> wrote:
>
> On Tue, Jan 17, 2023 at 3:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>>
>> On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
>> >
>> > Hi Ard, Hi everyone,
>> >
>> > Thanks for the work!
>> >
>> > But somehow this patch (as it was merged into master branch) does not
>> > work for me on the ThunderX box we have.
>> >
>> > Any idea what could be wrong?
>>
>> I'm not sure I understand the question. The patch targets ThunderX,
>> and you are using a ThunderX2.
>>
>> What were you expecting to happen, and what is happening instead?
>
>
> Firmware does not start at all when using KVM.
>
> Please excuse my limited knowledge of Arm processor variants.
> I assumed that ThunderX and ThunderX2 are very similar and hoped
> the fix would also work for this case.
>
> The issue was introduced by the same commit that Dann
> reported (07be1d34d95460a238fcd0f6693efb747c28b329):
> "ArmVirtPkg/ArmVirtQemu: enable initial ID map at early boot".
>
Can you share the QEMU command line that you are using? I use a
ThunderX2 basically 24/7 to do all my Linux and EDK2 development, so
this change was developed on ThunderX2 and so I'm surprised you are
seeing this issue.
Did you try the DEBUG build as well?
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-18 7:34 ` Ard Biesheuvel
@ 2023-01-18 8:27 ` Oliver Steffen
2023-01-18 8:48 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-01-18 8:27 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, dann.frazier
Quoting Ard Biesheuvel (2023-01-18 08:34:32)
> On Wed, 18 Jan 2023 at 07:37, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > On Tue, Jan 17, 2023 at 3:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >>
> >> On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
> >> >
> >> > Hi Ard, Hi everyone,
> >> >
> >> > Thanks for the work!
> >> >
> >> > But somehow this patch (as it was merged into master branch) does not
> >> > work for me on the ThunderX box we have.
> >> >
> >> > Any idea what could be wrong?
> >>
> >> I'm not sure I understand the question. The patch targets ThunderX,
> >> and you are using a ThunderX2.
> >>
> >> What were you expecting to happen, and what is happening instead?
> >
> >
> > Firmware does not start at all when using KVM.
> >
> > Please excuse my limited knowledge of Arm processor variants.
> > I assumed that ThunderX and ThunderX2 are very similar and hoped
> > the fix would also work for this case.
> >
> > The issue was introduced by the same commit that Dann
> > reported (07be1d34d95460a238fcd0f6693efb747c28b329):
> > "ArmVirtPkg/ArmVirtQemu: enable initial ID map at early boot".
> >
>
> Can you share the QEMU command line that you are using? I use a
> ThunderX2 basically 24/7 to do all my Linux and EDK2 development, so
> this change was developed on ThunderX2 and so I'm surprised you are
> seeing this issue.
>
> Did you try the DEBUG build as well?
Yes, debug is on.
Here is what I have, trying with the master branch from just now
(998ebe5ca0ae5c449e83ede533bee872f97d63af):
# make -C BaseTools && \
. ./edksetup.sh && \
build -t GCC5 -a AARCH64 \
-p ArmVirtPkg/ArmVirtQemu.dsc \
-DCAVIUM_ERRATUM_27456 \
-b DEBUG
# /usr/libexec/qemu-kvm \
-machine accel=kvm -m 1G -boot menu=on \
-blockdev node-name=code,driver=file,filename="${FW_CODE_RESIZED}",read-only=on
\
-blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
-machine pflash0=code \
-machine pflash1=vars \
-cpu max \
-net none \
-serial stdio
# /usr/libexec/qemu-kvm --version
QEMU emulator version 7.2.0 (qemu-kvm-7.2.0-3.el9)
# uname -r
5.14.0-234.el9.aarch64
# gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/aarch64-redhat-linux/11/lto-wrapper
Target: aarch64-redhat-linux
Configured with: ../configure --enable-bootstrap --enable-host-pie
--enable-host-bind-now --enable-languages=c,c++,fortran,lto
--prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info
--with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-shared
--enable-threads=posix --enable-checking=release --with-system-zlib
--enable-__cxa_atexit --disable-libunwind-exceptions
--enable-gnu-unique-object --enable-linker-build-id
--with-gcc-major-version-only --enable-plugin --enable-initfini-array
--without-isl --enable-multilib --with-linker-hash-style=gnu
--enable-gnu-indirect-function --build=aarch64-redhat-linux
--with-build-config=bootstrap-lto --enable-link-serialization=1
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 11.3.1 20221121 (Red Hat 11.3.1-4) (GCC)
Since you have the same CPU... Might this be a bug in KVM?
Thanks!
-Oliver
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-18 8:27 ` Oliver Steffen
@ 2023-01-18 8:48 ` Ard Biesheuvel
2023-01-18 9:22 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-18 8:48 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, dann.frazier
On Wed, 18 Jan 2023 at 09:28, Oliver Steffen <osteffen@redhat.com> wrote:
>
> Quoting Ard Biesheuvel (2023-01-18 08:34:32)
> > On Wed, 18 Jan 2023 at 07:37, Oliver Steffen <osteffen@redhat.com> wrote:
> > >
> > > On Tue, Jan 17, 2023 at 3:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > >>
> > >> On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
> > >> >
> > >> > Hi Ard, Hi everyone,
> > >> >
> > >> > Thanks for the work!
> > >> >
> > >> > But somehow this patch (as it was merged into master branch) does not
> > >> > work for me on the ThunderX box we have.
> > >> >
> > >> > Any idea what could be wrong?
> > >>
> > >> I'm not sure I understand the question. The patch targets ThunderX,
> > >> and you are using a ThunderX2.
> > >>
> > >> What were you expecting to happen, and what is happening instead?
> > >
> > >
> > > Firmware does not start at all when using KVM.
> > >
> > > Please excuse my limited knowledge of Arm processor variants.
> > > I assumed that ThunderX and ThunderX2 are very similar and hoped
> > > the fix would also work for this case.
> > >
> > > The issue was introduced by the same commit that Dann
> > > reported (07be1d34d95460a238fcd0f6693efb747c28b329):
> > > "ArmVirtPkg/ArmVirtQemu: enable initial ID map at early boot".
> > >
> >
> > Can you share the QEMU command line that you are using? I use a
> > ThunderX2 basically 24/7 to do all my Linux and EDK2 development, so
> > this change was developed on ThunderX2 and so I'm surprised you are
> > seeing this issue.
> >
> > Did you try the DEBUG build as well?
> Yes, debug is on.
>
> Here is what I have, trying with the master branch from just now
> (998ebe5ca0ae5c449e83ede533bee872f97d63af):
>
> # make -C BaseTools && \
> . ./edksetup.sh && \
> build -t GCC5 -a AARCH64 \
> -p ArmVirtPkg/ArmVirtQemu.dsc \
> -DCAVIUM_ERRATUM_27456 \
> -b DEBUG
>
> # /usr/libexec/qemu-kvm \
> -machine accel=kvm -m 1G -boot menu=on \
> -blockdev node-name=code,driver=file,filename="${FW_CODE_RESIZED}",read-only=on
> \
> -blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
> -machine pflash0=code \
> -machine pflash1=vars \
> -cpu max \
> -net none \
> -serial stdio
>
My distro does not have qemu-kvm, and using the command line above
results in the following if I try it with qemu-system-aarch64
"""
qemu-system-aarch64: No machine specified, and there is no default
Use -machine help to list supported machines
"""
unless I change it to
qemu-system-aarch64 -machine virt,accel=kvm -m 1G -boot menu=on \
-blockdev node-name=code,driver=file,filename=$HOME/bin/flash0.img,read-only=on
\
-blockdev node-name=vars,driver=file,filename=$HOME/bin/flash1.img \
-machine pflash0=code \
-machine pflash1=vars \
-cpu max \
-net none \
-nographic
and that works fine with my firmware build.
> # /usr/libexec/qemu-kvm --version
> QEMU emulator version 7.2.0 (qemu-kvm-7.2.0-3.el9)
>
> # uname -r
> 5.14.0-234.el9.aarch64
>
Yeah, that is quite old. One potential issue that comes to mind here
is the one addressed by the patch below.
>
>
> Since you have the same CPU... Might this be a bug in KVM?
>
Indeed. Could you try applying this patch?
commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9
Author: Marc Zyngier <maz@kernel.org>
Date: Tue Dec 20 14:03:52 2022 +0000
KVM: arm64: Fix S1PTW handling on RO memslots
Or check whether this is generally reproducible with newer kernels?
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-18 8:48 ` Ard Biesheuvel
@ 2023-01-18 9:22 ` Ard Biesheuvel
2023-01-19 11:03 ` Oliver Steffen
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-18 9:22 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, dann.frazier
On Wed, 18 Jan 2023 at 09:48, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Wed, 18 Jan 2023 at 09:28, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > Quoting Ard Biesheuvel (2023-01-18 08:34:32)
> > > On Wed, 18 Jan 2023 at 07:37, Oliver Steffen <osteffen@redhat.com> wrote:
> > > >
> > > > On Tue, Jan 17, 2023 at 3:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > > >>
> > > >> On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
> > > >> >
> > > >> > Hi Ard, Hi everyone,
> > > >> >
> > > >> > Thanks for the work!
> > > >> >
> > > >> > But somehow this patch (as it was merged into master branch) does not
> > > >> > work for me on the ThunderX box we have.
> > > >> >
> > > >> > Any idea what could be wrong?
> > > >>
> > > >> I'm not sure I understand the question. The patch targets ThunderX,
> > > >> and you are using a ThunderX2.
> > > >>
> > > >> What were you expecting to happen, and what is happening instead?
> > > >
> > > >
> > > > Firmware does not start at all when using KVM.
> > > >
> > > > Please excuse my limited knowledge of Arm processor variants.
> > > > I assumed that ThunderX and ThunderX2 are very similar and hoped
> > > > the fix would also work for this case.
> > > >
> > > > The issue was introduced by the same commit that Dann
> > > > reported (07be1d34d95460a238fcd0f6693efb747c28b329):
> > > > "ArmVirtPkg/ArmVirtQemu: enable initial ID map at early boot".
> > > >
> > >
> > > Can you share the QEMU command line that you are using? I use a
> > > ThunderX2 basically 24/7 to do all my Linux and EDK2 development, so
> > > this change was developed on ThunderX2 and so I'm surprised you are
> > > seeing this issue.
> > >
> > > Did you try the DEBUG build as well?
> > Yes, debug is on.
> >
> > Here is what I have, trying with the master branch from just now
> > (998ebe5ca0ae5c449e83ede533bee872f97d63af):
> >
> > # make -C BaseTools && \
> > . ./edksetup.sh && \
> > build -t GCC5 -a AARCH64 \
> > -p ArmVirtPkg/ArmVirtQemu.dsc \
> > -DCAVIUM_ERRATUM_27456 \
> > -b DEBUG
> >
> > # /usr/libexec/qemu-kvm \
> > -machine accel=kvm -m 1G -boot menu=on \
> > -blockdev node-name=code,driver=file,filename="${FW_CODE_RESIZED}",read-only=on
> > \
> > -blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
> > -machine pflash0=code \
> > -machine pflash1=vars \
> > -cpu max \
> > -net none \
> > -serial stdio
> >
>
> My distro does not have qemu-kvm, and using the command line above
> results in the following if i try it with qemu-system-aarch64
>
> """
> qemu-system-aarch64: No machine specified, and there is no default
> Use -machine help to list supported machines
> """
>
> unless i change it to
>
> qemu-system-aarch64 -machine virt,accel=kvm -m 1G -boot menu=on \
> -blockdev node-name=code,driver=file,filename=$HOME/bin/flash0.img,read-only=on
> \
> -blockdev node-name=vars,driver=file,filename=$HOME/bin/flash1.img \
> -machine pflash0=code \
> -machine pflash1=vars \
> -cpu max \
> -net none \
> -nographic
>
> and that works fine with my firmware build.
>
>
> > # /usr/libexec/qemu-kvm --version
> > QEMU emulator version 7.2.0 (qemu-kvm-7.2.0-3.el9)
> >
> > # uname -r
> > 5.14.0-234.el9.aarch64
> >
>
> Yeah, that is quite old. One potential issue that comes to mind here
> is the one address by the patch below
>
>
> >
> >
> > Since you have the same CPU... Might this be a bug in KVM?
> >
>
> Indeed. Could you try applying this patch?
>
> commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9
> Author: Marc Zyngier <maz@kernel.org>
> Date: Tue Dec 20 14:03:52 2022 +0000
>
> KVM: arm64: Fix S1PTW handling on RO memslots
>
> Or check whether this is generally reproducible with newer kernels?
Another thing you might try:
- build the firmware with the following hunk applied
"""
diff --git a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
index 5ac7c732f6ec..f4e1285beefc 100644
--- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
+++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
@@ -40,6 +40,12 @@
.set sctlrval, SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |
SCTLR_EL1_ITD | SCTLR_EL1_SED
.set sctlrval, sctlrval | SCTLR_ELx_I | SCTLR_EL1_SPAN | SCTLR_EL1_RES1
+ .align 11
+.Lvectors:
+ .rept 16
+ .align 7
+ b .
+ .endr
ASM_FUNC(ArmPlatformPeiBootAction)
#ifdef CAVIUM_ERRATUM_27456
@@ -90,6 +96,8 @@ ASM_FUNC(ArmPlatformPeiBootAction)
msr mair_el1, x0 // set up the 1:1 mapping
msr tcr_el1, x1
msr ttbr0_el1, x2
+ adr x0, .Lvectors
+ msr vbar_el1, x0
isb
tlbi vmalle1 // invalidate any cached translations
"""
- run qemu with the -s option and let it crash
- connect with gdb and dump the exception context
target remote:1234
set radix 16
p $FAR_EL1
p $ESR_EL1
p $ELR_EL1
That should at least tell us why the crash is occurring.
^ permalink raw reply related [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-18 9:22 ` Ard Biesheuvel
@ 2023-01-19 11:03 ` Oliver Steffen
2023-01-19 11:11 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-01-19 11:03 UTC (permalink / raw)
To: Ard Biesheuvel, devel; +Cc: dann.frazier, kraxel
Quoting Ard Biesheuvel (2023-01-18 10:22:12)
> On Wed, 18 Jan 2023 at 09:48, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > On Wed, 18 Jan 2023 at 09:28, Oliver Steffen <osteffen@redhat.com> wrote:
> > >
> > > Quoting Ard Biesheuvel (2023-01-18 08:34:32)
> > > > On Wed, 18 Jan 2023 at 07:37, Oliver Steffen <osteffen@redhat.com> wrote:
> > > > >
> > > > > On Tue, Jan 17, 2023 at 3:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > > > >>
> > > > >> On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
> > > > >> >
> > > > >> > Hi Ard, Hi everyone,
> > > > >> >
> > > > >> > Thanks for the work!
> > > > >> >
> > > > >> > But somehow this patch (as it was merged into master branch) does not
> > > > >> > work for me on the ThunderX box we have.
> > > > >> >
> > > > >> > Any idea what could be wrong?
> > > > >>
> > > > >> I'm not sure I understand the question. The patch targets ThunderX,
> > > > >> and you are using a ThunderX2.
> > > > >>
> > > > >> What were you expecting to happen, and what is happening instead?
> > > > >
> > > > >
> > > > > Firmware does not start at all when using KVM.
> > > > >
> > > > > Please excuse my limited knowledge of Arm processor variants.
> > > > > I assumed that ThunderX and ThunderX2 are very similar and hoped
> > > > > the fix would also work for this case.
> > > > >
> > > > > The issue was introduced by the same commit that Dann
> > > > > reported (07be1d34d95460a238fcd0f6693efb747c28b329):
> > > > > "ArmVirtPkg/ArmVirtQemu: enable initial ID map at early boot".
> > > > >
> > > >
> > > > Can you share the QEMU command line that you are using? I use a
> > > > ThunderX2 basically 24/7 to do all my Linux and EDK2 development, so
> > > > this change was developed on ThunderX2 and so I'm surprised you are
> > > > seeing this issue.
> > > >
> > > > Did you try the DEBUG build as well?
> > > Yes, debug is on.
> > >
> > > Here is what I have, trying with the master branch from just now
> > > (998ebe5ca0ae5c449e83ede533bee872f97d63af):
> > >
> > > # make -C BaseTools && \
> > > . ./edksetup.sh && \
> > > build -t GCC5 -a AARCH64 \
> > > -p ArmVirtPkg/ArmVirtQemu.dsc \
> > > -DCAVIUM_ERRATUM_27456 \
> > > -b DEBUG
> > >
> > > # /usr/libexec/qemu-kvm \
> > > -machine accel=kvm -m 1G -boot menu=on \
> > > -blockdev node-name=code,driver=file,filename="${FW_CODE_RESIZED}",read-only=on
> > > \
> > > -blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
> > > -machine pflash0=code \
> > > -machine pflash1=vars \
> > > -cpu max \
> > > -net none \
> > > -serial stdio
> > >
> >
> > My distro does not have qemu-kvm, and using the command line above
> > results in the following if i try it with qemu-system-aarch64
> >
> > """
> > qemu-system-aarch64: No machine specified, and there is no default
> > Use -machine help to list supported machines
> > """
> >
> > unless i change it to
> >
> > qemu-system-aarch64 -machine virt,accel=kvm -m 1G -boot menu=on \
> > -blockdev node-name=code,driver=file,filename=$HOME/bin/flash0.img,read-only=on
> > \
> > -blockdev node-name=vars,driver=file,filename=$HOME/bin/flash1.img \
> > -machine pflash0=code \
> > -machine pflash1=vars \
> > -cpu max \
> > -net none \
> > -nographic
> >
> > and that works fine with my firmware build.
> >
> >
> > > # /usr/libexec/qemu-kvm --version
> > > QEMU emulator version 7.2.0 (qemu-kvm-7.2.0-3.el9)
> > >
> > > # uname -r
> > > 5.14.0-234.el9.aarch64
> > >
> >
> > Yeah, that is quite old. One potential issue that comes to mind here
> > is the one address by the patch below
> >
> >
> > >
> > >
> > > Since you have the same CPU... Might this be a bug in KVM?
> > >
> >
> > Indeed. Could you try applying this patch?
> >
> > commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9
> > Author: Marc Zyngier <maz@kernel.org>
> > Date: Tue Dec 20 14:03:52 2022 +0000
> >
> > KVM: arm64: Fix S1PTW handling on RO memslots
> >
> > Or check whether this is generally reproducible with newer kernels?
>
> Another thing you might try:
>
> - build the firmware with the following hunk applied
>
> """
> diff --git a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> index 5ac7c732f6ec..f4e1285beefc 100644
> --- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> +++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> @@ -40,6 +40,12 @@
> .set sctlrval, SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |
> SCTLR_EL1_ITD | SCTLR_EL1_SED
> .set sctlrval, sctlrval | SCTLR_ELx_I | SCTLR_EL1_SPAN | SCTLR_EL1_RES1
>
> + .align 11
> +.Lvectors:
> + .rept 16
> + .align 7
> + b .
> + .endr
>
> ASM_FUNC(ArmPlatformPeiBootAction)
> #ifdef CAVIUM_ERRATUM_27456
> @@ -90,6 +96,8 @@ ASM_FUNC(ArmPlatformPeiBootAction)
> msr mair_el1, x0 // set up the 1:1 mapping
> msr tcr_el1, x1
> msr ttbr0_el1, x2
> + adr x0, .Lvectors
> + msr vbar_el1, x0
> isb
>
> tlbi vmalle1 // invalidate any cached translations
> """
>
> - run qemu with the -s option and let it crash
>
> - connect with gdb and dump the exception context
>
> target remote:1234
> set radix 16
> p $FAR_EL1
> p $ESR_EL1
> p $ELR_EL1
>
> That should at least tell us why the crash is occurring.
>
I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
No luck.
I applied the patch and attached gdb, as described (Qemu 7.2.50):
p $ELR_EL1
(gdb) p $FAR_EL1
$1 = 0x6200
(gdb) p $ESR_EL1
$2 = 0x86000010
(gdb) p $ELR_EL1
$3 = 0x6200
There is no sign of any crash. It seems like it does not even start
running.
Thanks,
Oliver
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 11:03 ` Oliver Steffen
@ 2023-01-19 11:11 ` Ard Biesheuvel
2023-01-19 11:25 ` Oliver Steffen
` (2 more replies)
0 siblings, 3 replies; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-19 11:11 UTC (permalink / raw)
To: Oliver Steffen, Marc Zyngier; +Cc: devel, dann.frazier, kraxel
(cc Marc)
Context:
- on my TX2 (with the S1PTW r/o memslot fix applied), the new version
of ArmVirtQemu that uses an initial ID map in emulated NOR flash works
fine.
- in Oliver's case (which is a slightly different flavor of TX2), it
crashes extremely early, presumably at the point where this ID map is
activated.
More details at the end.
On Thu, 19 Jan 2023 at 12:03, Oliver Steffen <osteffen@redhat.com> wrote:
>
> Quoting Ard Biesheuvel (2023-01-18 10:22:12)
> > On Wed, 18 Jan 2023 at 09:48, Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > On Wed, 18 Jan 2023 at 09:28, Oliver Steffen <osteffen@redhat.com> wrote:
> > > >
> > > > Quoting Ard Biesheuvel (2023-01-18 08:34:32)
> > > > > On Wed, 18 Jan 2023 at 07:37, Oliver Steffen <osteffen@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Jan 17, 2023 at 3:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > > > > >>
> > > > > >> On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
> > > > > >> >
> > > > > >> > Hi Ard, Hi everyone,
> > > > > >> >
> > > > > >> > Thanks for the work!
> > > > > >> >
> > > > > >> > But somehow this patch (as it was merged into master branch) does not
> > > > > >> > work for me on the ThunderX box we have.
> > > > > >> >
> > > > > >> > Any idea what could be wrong?
> > > > > >>
> > > > > >> I'm not sure I understand the question. The patch targets ThunderX,
> > > > > >> and you are using a ThunderX2.
> > > > > >>
> > > > > >> What were you expecting to happen, and what is happening instead?
> > > > > >
> > > > > >
> > > > > > Firmware does not start at all when using KVM.
> > > > > >
> > > > > > Please excuse my limited knowledge of Arm processor variants.
> > > > > > I assumed that ThunderX and ThunderX2 are very similar and hoped
> > > > > > the fix would also work for this case.
> > > > > >
> > > > > > The issue was introduced by the same commit that Dann
> > > > > > reported (07be1d34d95460a238fcd0f6693efb747c28b329):
> > > > > > "ArmVirtPkg/ArmVirtQemu: enable initial ID map at early boot".
> > > > > >
> > > > >
> > > > > Can you share the QEMU command line that you are using? I use a
> > > > > ThunderX2 basically 24/7 to do all my Linux and EDK2 development, so
> > > > > this change was developed on ThunderX2 and so I'm surprised you are
> > > > > seeing this issue.
> > > > >
> > > > > Did you try the DEBUG build as well?
> > > > Yes, debug is on.
> > > >
> > > > Here is what I have, trying with the master branch from just now
> > > > (998ebe5ca0ae5c449e83ede533bee872f97d63af):
> > > >
> > > > # make -C BaseTools && \
> > > > . ./edksetup.sh && \
> > > > build -t GCC5 -a AARCH64 \
> > > > -p ArmVirtPkg/ArmVirtQemu.dsc \
> > > > -DCAVIUM_ERRATUM_27456 \
> > > > -b DEBUG
> > > >
> > > > # /usr/libexec/qemu-kvm \
> > > > -machine accel=kvm -m 1G -boot menu=on \
> > > > -blockdev node-name=code,driver=file,filename="${FW_CODE_RESIZED}",read-only=on
> > > > \
> > > > -blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
> > > > -machine pflash0=code \
> > > > -machine pflash1=vars \
> > > > -cpu max \
> > > > -net none \
> > > > -serial stdio
> > > >
> > >
> > > My distro does not have qemu-kvm, and using the command line above
> > > results in the following if i try it with qemu-system-aarch64
> > >
> > > """
> > > qemu-system-aarch64: No machine specified, and there is no default
> > > Use -machine help to list supported machines
> > > """
> > >
> > > unless i change it to
> > >
> > > qemu-system-aarch64 -machine virt,accel=kvm -m 1G -boot menu=on \
> > > -blockdev node-name=code,driver=file,filename=$HOME/bin/flash0.img,read-only=on
> > > \
> > > -blockdev node-name=vars,driver=file,filename=$HOME/bin/flash1.img \
> > > -machine pflash0=code \
> > > -machine pflash1=vars \
> > > -cpu max \
> > > -net none \
> > > -nographic
> > >
> > > and that works fine with my firmware build.
> > >
> > >
> > > > # /usr/libexec/qemu-kvm --version
> > > > QEMU emulator version 7.2.0 (qemu-kvm-7.2.0-3.el9)
> > > >
> > > > # uname -r
> > > > 5.14.0-234.el9.aarch64
> > > >
> > >
> > > Yeah, that is quite old. One potential issue that comes to mind here
> > > is the one address by the patch below
> > >
> > >
> > > >
> > > >
> > > > Since you have the same CPU... Might this be a bug in KVM?
> > > >
> > >
> > > Indeed. Could you try applying this patch?
> > >
> > > commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9
> > > Author: Marc Zyngier <maz@kernel.org>
> > > Date: Tue Dec 20 14:03:52 2022 +0000
> > >
> > > KVM: arm64: Fix S1PTW handling on RO memslots
> > >
> > > Or check whether this is generally reproducible with newer kernels?
> >
> > Another thing you might try:
> >
> > - build the firmware with the following hunk applied
> >
> > """
> > diff --git a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > index 5ac7c732f6ec..f4e1285beefc 100644
> > --- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > +++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > @@ -40,6 +40,12 @@
> > .set sctlrval, SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |
> > SCTLR_EL1_ITD | SCTLR_EL1_SED
> > .set sctlrval, sctlrval | SCTLR_ELx_I | SCTLR_EL1_SPAN | SCTLR_EL1_RES1
> >
> > + .align 11
> > +.Lvectors:
> > + .rept 16
> > + .align 7
> > + b .
> > + .endr
> >
> > ASM_FUNC(ArmPlatformPeiBootAction)
> > #ifdef CAVIUM_ERRATUM_27456
> > @@ -90,6 +96,8 @@ ASM_FUNC(ArmPlatformPeiBootAction)
> > msr mair_el1, x0 // set up the 1:1 mapping
> > msr tcr_el1, x1
> > msr ttbr0_el1, x2
> > + adr x0, .Lvectors
> > + msr vbar_el1, x0
> > isb
> >
> > tlbi vmalle1 // invalidate any cached translations
> > """
> >
> > - run qemu with the -s option and let it crash
> >
> > - connect with gdb and dump the exception context
> >
> > target remote:1234
> > set radix 16
> > p $FAR_EL1
> > p $ESR_EL1
> > p $ELR_EL1
> >
> > That should at least tell us why the crash is occurring.
> >
>
> I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
> No luck.
>
Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
> I applied the patch and attached gdb, as described (Qemu 7.2.50):
>
> p $ELR_EL1
> (gdb) p $FAR_EL1
> $1 = 0x6200
> (gdb) p $ESR_EL1
> $2 = 0x86000010
> (gdb) p $ELR_EL1
> $3 = 0x6200
>
> There is no sign of any crash. It seems like it does not even start
> running.
>
So 0x6200 is the sync exception vector, which is both the code
location of the crash and the faulting address. This means fetching
the instructions to handle the original exception failed, and so the
original exception reason (ESR) is lost. However, the synchronous
external abort (https://esr.arm64.dev/?#0x86000010) that you are
seeing might point to an issue similar to (or the same as) the one that
Marc recently fixed in KVM.
It is quite odd that this does not reproduce *at all* on my TX2.
Fedora kernels don't use 64k pages, right?
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 11:11 ` Ard Biesheuvel
@ 2023-01-19 11:25 ` Oliver Steffen
2023-01-19 11:55 ` Marc Zyngier
2023-01-19 12:00 ` Gerd Hoffmann
2 siblings, 0 replies; 37+ messages in thread
From: Oliver Steffen @ 2023-01-19 11:25 UTC (permalink / raw)
To: Ard Biesheuvel, Marc Zyngier; +Cc: devel, dann.frazier, kraxel
Quoting Ard Biesheuvel (2023-01-19 12:11:34)
> (cc Marc)
>
> Context:
> - on my TX2 (with the S1PTW r/o memslot fix applied), the new version
> of ArmVirtQemu that uses an initial ID map in emulated NOR flash works
> fine.
> - in Oliver's case (which is a slightly different flavor of TX2), it
> crashes extremely early, presumably at the point where this ID map is
> activated.
>
> More details at the end.
>
> On Thu, 19 Jan 2023 at 12:03, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > Quoting Ard Biesheuvel (2023-01-18 10:22:12)
> > > On Wed, 18 Jan 2023 at 09:48, Ard Biesheuvel <ardb@kernel.org> wrote:
> > > >
> > > > On Wed, 18 Jan 2023 at 09:28, Oliver Steffen <osteffen@redhat.com> wrote:
> > > > >
> > > > > Quoting Ard Biesheuvel (2023-01-18 08:34:32)
> > > > > > On Wed, 18 Jan 2023 at 07:37, Oliver Steffen <osteffen@redhat.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jan 17, 2023 at 3:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > > > > > >>
> > > > > > >> On Tue, 17 Jan 2023 at 13:48, Oliver Steffen <osteffen@redhat.com> wrote:
> > > > > > >> >
> > > > > > >> > Hi Ard, Hi everyone,
> > > > > > >> >
> > > > > > >> > Thanks for the work!
> > > > > > >> >
> > > > > > >> > But somehow this patch (as it was merged into master branch) does not
> > > > > > >> > work for me on the ThunderX box we have.
> > > > > > >> >
> > > > > > >> > Any idea what could be wrong?
> > > > > > >>
> > > > > > >> I'm not sure I understand the question. The patch targets ThunderX,
> > > > > > >> and you are using a ThunderX2.
> > > > > > >>
> > > > > > >> What were you expecting to happen, and what is happening instead?
> > > > > > >
> > > > > > >
> > > > > > > Firmware does not start at all when using KVM.
> > > > > > >
> > > > > > > Please excuse my limited knowledge of Arm processor variants.
> > > > > > > I assumed that ThunderX and ThunderX2 are very similar and hoped
> > > > > > > the fix would also work for this case.
> > > > > > >
> > > > > > > The issue was introduced by the same commit that Dann
> > > > > > > reported (07be1d34d95460a238fcd0f6693efb747c28b329):
> > > > > > > "ArmVirtPkg/ArmVirtQemu: enable initial ID map at early boot".
> > > > > > >
> > > > > >
> > > > > > Can you share the QEMU command line that you are using? I use a
> > > > > > ThunderX2 basically 24/7 to do all my Linux and EDK2 development, so
> > > > > > this change was developed on ThunderX2 and so I'm surprised you are
> > > > > > seeing this issue.
> > > > > >
> > > > > > Did you try the DEBUG build as well?
> > > > > Yes, debug is on.
> > > > >
> > > > > Here is what I have, trying with the master branch from just now
> > > > > (998ebe5ca0ae5c449e83ede533bee872f97d63af):
> > > > >
> > > > > # make -C BaseTools && \
> > > > > . ./edksetup.sh && \
> > > > > build -t GCC5 -a AARCH64 \
> > > > > -p ArmVirtPkg/ArmVirtQemu.dsc \
> > > > > -DCAVIUM_ERRATUM_27456 \
> > > > > -b DEBUG
> > > > >
> > > > > # /usr/libexec/qemu-kvm \
> > > > > -machine accel=kvm -m 1G -boot menu=on \
> > > > > -blockdev node-name=code,driver=file,filename="${FW_CODE_RESIZED}",read-only=on
> > > > > \
> > > > > -blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
> > > > > -machine pflash0=code \
> > > > > -machine pflash1=vars \
> > > > > -cpu max \
> > > > > -net none \
> > > > > -serial stdio
> > > > >
> > > >
> > > > My distro does not have qemu-kvm, and using the command line above
> > > > results in the following if i try it with qemu-system-aarch64
> > > >
> > > > """
> > > > qemu-system-aarch64: No machine specified, and there is no default
> > > > Use -machine help to list supported machines
> > > > """
> > > >
> > > > unless i change it to
> > > >
> > > > qemu-system-aarch64 -machine virt,accel=kvm -m 1G -boot menu=on \
> > > > -blockdev node-name=code,driver=file,filename=$HOME/bin/flash0.img,read-only=on
> > > > \
> > > > -blockdev node-name=vars,driver=file,filename=$HOME/bin/flash1.img \
> > > > -machine pflash0=code \
> > > > -machine pflash1=vars \
> > > > -cpu max \
> > > > -net none \
> > > > -nographic
> > > >
> > > > and that works fine with my firmware build.
> > > >
> > > >
> > > > > # /usr/libexec/qemu-kvm --version
> > > > > QEMU emulator version 7.2.0 (qemu-kvm-7.2.0-3.el9)
> > > > >
> > > > > # uname -r
> > > > > 5.14.0-234.el9.aarch64
> > > > >
> > > >
> > > > Yeah, that is quite old. One potential issue that comes to mind here
> > > > is the one address by the patch below
> > > >
> > > >
> > > > >
> > > > >
> > > > > Since you have the same CPU... Might this be a bug in KVM?
> > > > >
> > > >
> > > > Indeed. Could you try applying this patch?
> > > >
> > > > commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9
> > > > Author: Marc Zyngier <maz@kernel.org>
> > > > Date: Tue Dec 20 14:03:52 2022 +0000
> > > >
> > > > KVM: arm64: Fix S1PTW handling on RO memslots
> > > >
> > > > Or check whether this is generally reproducible with newer kernels?
> > >
> > > Another thing you might try:
> > >
> > > - build the firmware with the following hunk applied
> > >
> > > """
> > > diff --git a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > > b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > > index 5ac7c732f6ec..f4e1285beefc 100644
> > > --- a/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > > +++ b/ArmVirtPkg/Library/ArmPlatformLibQemu/AArch64/ArmPlatformHelper.S
> > > @@ -40,6 +40,12 @@
> > > .set sctlrval, SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |
> > > SCTLR_EL1_ITD | SCTLR_EL1_SED
> > > .set sctlrval, sctlrval | SCTLR_ELx_I | SCTLR_EL1_SPAN | SCTLR_EL1_RES1
> > >
> > > + .align 11
> > > +.Lvectors:
> > > + .rept 16
> > > + .align 7
> > > + b .
> > > + .endr
> > >
> > > ASM_FUNC(ArmPlatformPeiBootAction)
> > > #ifdef CAVIUM_ERRATUM_27456
> > > @@ -90,6 +96,8 @@ ASM_FUNC(ArmPlatformPeiBootAction)
> > > msr mair_el1, x0 // set up the 1:1 mapping
> > > msr tcr_el1, x1
> > > msr ttbr0_el1, x2
> > > + adr x0, .Lvectors
> > > + msr vbar_el1, x0
> > > isb
> > >
> > > tlbi vmalle1 // invalidate any cached translations
> > > """
> > >
> > > - run qemu with the -s option and let it crash
> > >
> > > - connect with gdb and dump the exception context
> > >
> > > target remote:1234
> > > set radix 16
> > > p $FAR_EL1
> > > p $ESR_EL1
> > > p $ELR_EL1
> > >
> > > That should at least tell us why the crash is occurring.
> > >
> >
> > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
> > No luck.
> >
>
> Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
>
> > I applied the patch and attached gdb, as described (Qemu 7.2.50):
> >
> > p $ELR_EL1
> > (gdb) p $FAR_EL1
> > $1 = 0x6200
> > (gdb) p $ESR_EL1
> > $2 = 0x86000010
> > (gdb) p $ELR_EL1
> > $3 = 0x6200
> >
> > There is no sign of any crash. It seems like it does not even start
> > running.
> >
>
> So 0x6200 is the sync exception vector, which is both the code
> location of the crash and the faulting address. This means fetching
> the instructions to handle the original exception failed, and so the
> original exception reason (ESR) is lost. However, the synchronous
> external abort (https://esr.arm64.dev/?#0x86000010) that you are
> seeing might point to an issue similar (or the same) that Marc
> recently fixed in KVM.
>
> It is quite odd that this does not reproduce *at all* on my TX2.
> Fedora kernels don't use 64k pages right?
>
Kernel config says:
CONFIG_ARM64_4K_PAGES=y
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 11:11 ` Ard Biesheuvel
2023-01-19 11:25 ` Oliver Steffen
@ 2023-01-19 11:55 ` Marc Zyngier
2023-01-19 12:21 ` Ard Biesheuvel
2023-01-19 12:00 ` Gerd Hoffmann
2 siblings, 1 reply; 37+ messages in thread
From: Marc Zyngier @ 2023-01-19 11:55 UTC (permalink / raw)
To: Ard Biesheuvel, Oliver Steffen; +Cc: devel, dann.frazier, kraxel
On Thu, 19 Jan 2023 11:11:34 +0000,
Ard Biesheuvel <ardb@kernel.org> wrote:
>
> (cc Marc)
>
> Context:
> - on my TX2 (with the S1PTW r/o memslot fix applied), the new version
> of ArmVirtQemu that uses an initial ID map in emulated NOR flash works
> fine.
> - in Oliver's case (which is a slightly different flavor of TX2), it
> crashes extremely early, presumably at the point where this ID map is
> activated.
Oliver seems to have a vintage ThunderX (aka the worst arm64
implementation in history!), so it is indeed a very different beast
from TX2.
Without the kernel patch[1], I can trigger the issue pretty reliably,
especially in the absence of THP. It all depends on the layout of the
EDK2 object and the order in which pages get mapped.
The first course of action would be to make sure that the patch is
applied to the host kernel. If this still fails to boot, I'm happy to
help investigate it.
M.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/patch?id=406504c7b0405d74d74c15a667cd4c4620c3e7a9
--
Without deviation from the norm, progress is not possible.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 11:11 ` Ard Biesheuvel
2023-01-19 11:25 ` Oliver Steffen
2023-01-19 11:55 ` Marc Zyngier
@ 2023-01-19 12:00 ` Gerd Hoffmann
2023-01-19 12:55 ` Oliver Steffen
2 siblings, 1 reply; 37+ messages in thread
From: Gerd Hoffmann @ 2023-01-19 12:00 UTC (permalink / raw)
To: devel, ardb; +Cc: Oliver Steffen, Marc Zyngier, dann.frazier
Hi,
> > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
> > No luck.
> >
>
> Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
Probably not, given that Fedora kernels are almost vanilla and the
backport landed in stable in v6.1.7 (commit 9a1195c584321).
Oliver: try pulling the latest kernel directly from koji:
https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
take care,
Gerd
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 11:55 ` Marc Zyngier
@ 2023-01-19 12:21 ` Ard Biesheuvel
0 siblings, 0 replies; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-19 12:21 UTC (permalink / raw)
To: Marc Zyngier; +Cc: Oliver Steffen, devel, dann.frazier, kraxel
On Thu, 19 Jan 2023 at 12:56, Marc Zyngier <maz@kernel.org> wrote:
>
> On Thu, 19 Jan 2023 11:11:34 +0000,
> Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > (cc Marc)
> >
> > Context:
> > - on my TX2 (with the S1PTW r/o memslot fix applied), the new version
> > of ArmVirtQemu that uses an initial ID map in emulated NOR flash works
> > fine.
> > - in Oliver's case (which is a slightly different flavor of TX2), it
> > crashes extremely early, presumably at the point where this ID map is
> > activated.
>
> Oliver seems to have a vintage ThunderX (aka the worst arm64
> implementation in history!), so it is indeed a very different beat
> from TX2.
>
No, it's a TX2 alright - we disabled this ID map for TX1 already
because it tickles the I-cache corruption erratum (EDK2 runs at EL1
and the initial ID map is ASID scoped with non-global mappings, so
that we can switch to the actual ID map seamlessly)
> Without the kernel patch[1], I can trigger the issue pretty reliably,
> specially in the absence of THP. It all depends on the layout of the
> EDK2 object and the order in which pages get mapped.
>
> The first course of action would be to make sure that the patch is
> applied to the host kernel. If this still fails to boot, I'm happy to
> help investigating it.
>
indeed.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 12:00 ` Gerd Hoffmann
@ 2023-01-19 12:55 ` Oliver Steffen
2023-01-19 13:21 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-01-19 12:55 UTC (permalink / raw)
To: Gerd Hoffmann, ardb, devel; +Cc: Marc Zyngier, dann.frazier
Quoting Gerd Hoffmann (2023-01-19 13:00:21)
> Hi,
>
> > > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
> > > No luck.
> > >
> >
> > Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
>
> Probably not given that fedora kernels are almost vanilla and the
> backport landed in stable in v6.1.7 (commit 9a1195c584321).
>
> Oliver: try pulling the latest kernel directly from koji:
> https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
>
> take care,
> Gerd
>
Thanks for the koji link, Gerd.
It works with 6.1.7-200.fc37.aarch64.
Perfect.
Thanks Ard, thanks Marc!
-Oliver
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 12:55 ` Oliver Steffen
@ 2023-01-19 13:21 ` Ard Biesheuvel
2023-01-26 12:01 ` Gerd Hoffmann
2023-02-01 9:13 ` Oliver Steffen
0 siblings, 2 replies; 37+ messages in thread
From: Ard Biesheuvel @ 2023-01-19 13:21 UTC (permalink / raw)
To: devel, osteffen; +Cc: Gerd Hoffmann, Marc Zyngier, dann.frazier
On Thu, 19 Jan 2023 at 13:55, Oliver Steffen <osteffen@redhat.com> wrote:
>
> Quoting Gerd Hoffmann (2023-01-19 13:00:21)
> > Hi,
> >
> > > > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > > > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
> > > > No luck.
> > > >
> > >
> > > Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
> >
> > Probably not given that fedora kernels are almost vanilla and the
> > backport landed in stable in v6.1.7 (commit 9a1195c584321).
> >
> > Oliver: try pulling the latest kernel directly from koji:
> > https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
> >
> > take care,
> > Gerd
> >
>
> Thanks for the koji link, Gerd.
>
> It works with 6.1.7-200.fc37.aarch64.
> Perfect.
>
> Thanks Ard, thanks Marc!
>
Thanks for the report. Up to this point, Marc is the only one who had
managed to reproduce this afaik, so having another data point where
the fix works as intended is rather nice.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 13:21 ` Ard Biesheuvel
@ 2023-01-26 12:01 ` Gerd Hoffmann
2023-01-26 13:48 ` Marc Zyngier
2023-02-01 9:13 ` Oliver Steffen
1 sibling, 1 reply; 37+ messages in thread
From: Gerd Hoffmann @ 2023-01-26 12:01 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, osteffen, Marc Zyngier, dann.frazier
On Thu, Jan 19, 2023 at 02:21:30PM +0100, Ard Biesheuvel wrote:
> On Thu, 19 Jan 2023 at 13:55, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > Quoting Gerd Hoffmann (2023-01-19 13:00:21)
> > > Hi,
> > >
> > > > > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > > > > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
> > > > > No luck.
> > > > >
> > > >
> > > > Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
> > >
> > > Probably not given that fedora kernels are almost vanilla and the
> > > backport landed in stable in v6.1.7 (commit 9a1195c584321).
> > >
> > > Oliver: try pulling the latest kernel directly from koji:
> > > https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
> > >
> > > take care,
> > > Gerd
> > >
> >
> > Thanks for the koji link, Gerd.
> >
> > It works with 6.1.7-200.fc37.aarch64.
> > Perfect.
> >
> > Thanks Ard, thanks Marc!
> >
>
> Thanks for the report. Up to this point, Marc is the only one who had
> managed to reproduce this afaik, so having another data point where
> the fix works as intended is rather nice.
One more data point: This also gets things working again on the
raspberry pi 3. Well, sort of. There still is some timer problem
(it waits forever on the splash screen). That might also be some
qemu bug though ...
take care,
Gerd
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-26 12:01 ` Gerd Hoffmann
@ 2023-01-26 13:48 ` Marc Zyngier
2023-01-26 14:46 ` Gerd Hoffmann
0 siblings, 1 reply; 37+ messages in thread
From: Marc Zyngier @ 2023-01-26 13:48 UTC (permalink / raw)
To: Gerd Hoffmann; +Cc: Ard Biesheuvel, devel, osteffen, dann.frazier
On Thu, 26 Jan 2023 12:01:51 +0000,
Gerd Hoffmann <kraxel@redhat.com> wrote:
>
> On Thu, Jan 19, 2023 at 02:21:30PM +0100, Ard Biesheuvel wrote:
> > On Thu, 19 Jan 2023 at 13:55, Oliver Steffen <osteffen@redhat.com> wrote:
> > >
> > > Quoting Gerd Hoffmann (2023-01-19 13:00:21)
> > > > Hi,
> > > >
> > > > > > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > > > > > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
> > > > > > No luck.
> > > > > >
> > > > >
> > > > > Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
> > > >
> > > > Probably not given that fedora kernels are almost vanilla and the
> > > > backport landed in stable in v6.1.7 (commit 9a1195c584321).
> > > >
> > > > Oliver: try pulling the latest kernel directly from koji:
> > > > https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
> > > >
> > > > take care,
> > > > Gerd
> > > >
> > >
> > > Thanks for the koji link, Gerd.
> > >
> > > It works with 6.1.7-200.fc37.aarch64.
> > > Perfect.
> > >
> > > Thanks Ard, thanks Marc!
> > >
> >
> > Thanks for the report. Up to this point, Marc is the only one who had
> > managed to reproduce this afaik, so having another data point where
> > the fix works as intended is rather nice.
>
> One more data point: This also gets things working again on the
> raspberry pi 3. Well, sort of. There still is some timer problem
> (it waits forever on the splash screen). That might also be some
> qemu bug though ...
RPi3 as a host? Using the QEMU-provided GIC emulation? I wouldn't be
surprised if that broke ages ago as nobody really tests this
configuration. If you can pinpoint the last state where this actually
worked, I'll try and have a look.
But this sounds unrelated to the page fault handling that my patch was
fixing.
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-26 13:48 ` Marc Zyngier
@ 2023-01-26 14:46 ` Gerd Hoffmann
2023-01-26 15:08 ` Marc Zyngier
0 siblings, 1 reply; 37+ messages in thread
From: Gerd Hoffmann @ 2023-01-26 14:46 UTC (permalink / raw)
To: Marc Zyngier; +Cc: Ard Biesheuvel, devel, osteffen, dann.frazier
On Thu, Jan 26, 2023 at 01:48:12PM +0000, Marc Zyngier wrote:
> On Thu, 26 Jan 2023 12:01:51 +0000,
> Gerd Hoffmann <kraxel@redhat.com> wrote:
> >
> > > > Thanks for the koji link, Gerd.
> > > >
> > > > It works with 6.1.7-200.fc37.aarch64.
> > > > Perfect.
> > > >
> > > > Thanks Ard, thanks Marc!
> > > >
> > >
> > > Thanks for the report. Up to this point, Marc is the only one who had
> > > managed to reproduce this afaik, so having another data point where
> > > the fix works as intended is rather nice.
> >
> > One more data point: This also gets things working again on the
> > raspberry pi 3. Well, sort of. There still is some timer problem
> > (it waits forever on the splash screen). That might also be some
> > qemu bug though ...
>
> RPi3 as a host? Using the QEMU-provided GIC emulation?
Yes.
> I wouldn't be surprised if that broke ages ago as nobody really tests
> this configuration.
Understandable. 1G is enough for simple smoke tests but not much
beyond that. And it is the odd one out in that it doesn't have a GIC.
> If you can pinpoint the last state where this actually worked, I'll
> try and have a look.
I don't have that at hand. I need to try downgrading things and see what
I find. I already tried edk2 from about a year back, without success. I
haven't tried downgrading qemu / kernel yet.
> But this sounds unrelated to the page fault handling that my patch was
> fixing.
Yes, unrelated bug. But the page fault bug reproduces on the rpi3 too:
without the fix the firmware hangs before printing the first log line.
I wanted to report that because Ard mentioned it is hard to reproduce.
take care,
Gerd
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-26 14:46 ` Gerd Hoffmann
@ 2023-01-26 15:08 ` Marc Zyngier
0 siblings, 0 replies; 37+ messages in thread
From: Marc Zyngier @ 2023-01-26 15:08 UTC (permalink / raw)
To: Gerd Hoffmann; +Cc: Ard Biesheuvel, devel, osteffen, dann.frazier
On Thu, 26 Jan 2023 14:46:13 +0000,
Gerd Hoffmann <kraxel@redhat.com> wrote:
>
> On Thu, Jan 26, 2023 at 01:48:12PM +0000, Marc Zyngier wrote:
> > On Thu, 26 Jan 2023 12:01:51 +0000,
> > Gerd Hoffmann <kraxel@redhat.com> wrote:
> > >
> > > > > Thanks for the koji link, Gerd.
> > > > >
> > > > > It works with 6.1.7-200.fc37.aarch64.
> > > > > Perfect.
> > > > >
> > > > > Thanks Ard, thanks Marc!
> > > > >
> > > >
> > > > Thanks for the report. Up to this point, Marc is the only one who had
> > > > managed to reproduce this afaik, so having another data point where
> > > > the fix works as intended is rather nice.
> > >
> > > One more data point: This also gets things working again on the
> > > raspberry pi 3. Well, sort of. There still is some timer problem
> > > (it waits forever on the splash screen). That might also be some
> > > qemu bug though ...
> >
> > RPi3 as a host? Using the QEMU-provided GIC emulation?
>
> Yes.
>
> > I wouldn't be surprised if that broke ages ago as nobody really tests
> > this configuration.
>
> Understandable. 1G is enough for simple smoke tests but not much
> beyond that. And it is the odd child in the gang which hasn't a gic.
I suspect that it is that last point that has broken. The kernel
handling of that configuration has always been super sketchy, and it
would take almost nothing for that to break.
I'm almost thinking of writing an in-kernel GICv2 emulation just to
get rid of the rest of the hacks that have proliferated in the timer
and PMU code. But is it worth it for such an odd machine?
The alternative would be to move on and stop supporting KVM on this
wonky setup. :-/
> > If you can pinpoint the last state where this actually worked, I'll
> > try and have a look.
>
> Don't have that at hand. Need try downgrade stuff and see what I find.
> Tried with edk2 already back a year or so without success. Didn't try
> downgrading qemu / kernel yet.
Right. Let me know if you get somewhere with that at some point.
> > But this sounds unrelated to the page fault handling that my patch was
> > fixing.
>
> Yes, unrelated bug. But the page fault bug reproduces on the rpi3 too,
> without the fix the firmware hangs before printing the first log line.
> Wanted report that because Ard mentioned it is hard to reproduce.
Right, thanks for that. Very interesting data point indeed.
Cheers,
M.
--
Without deviation from the norm, progress is not possible.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-01-19 13:21 ` Ard Biesheuvel
2023-01-26 12:01 ` Gerd Hoffmann
@ 2023-02-01 9:13 ` Oliver Steffen
2023-02-01 11:51 ` Ard Biesheuvel
1 sibling, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-02-01 9:13 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
[-- Attachment #1: Type: text/plain, Size: 6877 bytes --]
Hi everyone!
On Thu, Jan 19, 2023 at 2:21 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Thu, 19 Jan 2023 at 13:55, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > Quoting Gerd Hoffmann (2023-01-19 13:00:21)
> > > Hi,
> > >
> > > > > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > > > > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from
Fedora).
> > > > > No luck.
> > > > >
> > > >
> > > > Does that include a backport of commit
406504c7b0405d74d74c15a667cd4c4620c3e7a9?
> > >
> > > Probably not given that fedora kernels are almost vanilla and the
> > > backport landed in stable in v6.1.7 (commit 9a1195c584321).
> > >
> > > Oliver: try pulling the latest kernel directly from koji:
> > > https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
> > >
> > > take care,
> > > Gerd
> > >
> >
> > Thanks for the koji link, Gerd.
> >
> > It works with 6.1.7-200.fc37.aarch64.
> > Perfect.
> >
> > Thanks Ard, thanks Marc!
> >
>
> Thanks for the report. Up to this point, Marc is the only one who had
> managed to reproduce this afaik, so having another data point where
> the fix works as intended is rather nice.
>
I am sorry, this story does not seem to be over yet.
We are using the Erratum patch and also included the commit 406504c7 in
the kernel.
Now the firmware crashes sometimes (10 out of 89 tests).
Any hints are very welcome!
Here is the serial output of one case:
----------------------------------------------------------------
UEFI firmware (version edk2-20221207gitfff6d81270b5-4.el9.test built at
00:00:00 on Jan 18 2023)
SyncPcrAllocationsAndPcrMask!
Synchronous Exception at 0x000000037FD5BDE0
PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
PC 0x00037FD5BCE0 (0x00037FD59000+0x00002CE0) [ 0] ArmCpuDxe.dll
PC 0x00037FD5C054 (0x00037FD59000+0x00003054) [ 0] ArmCpuDxe.dll
PC 0x0000476F08EC (0x0000476EE000+0x000028EC) [ 1] DxeCore.dll
PC 0x0000476F65C0 (0x0000476EE000+0x000085C0) [ 1] DxeCore.dll
PC 0x0000476FB5B8 (0x0000476EE000+0x0000D5B8) [ 1] DxeCore.dll
PC 0x0000476F5DB8 (0x0000476EE000+0x00007DB8) [ 1] DxeCore.dll
PC 0x00037FD5CD50 (0x00037FD59000+0x00003D50) [ 2] ArmCpuDxe.dll
PC 0x0000476F5758 (0x0000476EE000+0x00007758) [ 3] DxeCore.dll
PC 0x000047702B18 (0x0000476EE000+0x00014B18) [ 3] DxeCore.dll
PC 0x0000476F8D70 (0x0000476EE000+0x0000AD70) [ 3] DxeCore.dll
[ 0]
/builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
[ 1]
/builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
[ 2]
/builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
[ 3]
/builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
X0 0x0000000047FFE068 X1 0x000000037F179003 X2 0x0000000000340000
X3 0x0000000000000000
X4 0x0000000000000200 X5 0x0000000000000004 X6 0x0060000000000000
X7 0xFF9F000000000F3F
X8 0x000000037FFFF008 X9 0x0000000400000000 X10 0x000000037F177000
X11 0x000000037FA37FFF
X12 0x0000000000000000 X13 0x0000000000000008 X14 0x0000000000000000
X15 0x0000000000000000
X16 0x000000037FD5A208 X17 0x00000000007979D0 X18 0x0000000000000000
X19 0x0000000340000000
X20 0x0000000000000001 X21 0x000000037F179003 X22 0x0000000047FFE000
X23 0x0000000000000068
X24 0x000000003FFFFFFF X25 0x000000037CAA0000 X26 0x0000000000000002
X27 0x000000037F179000
X28 0x0000000047FFE068 FP 0x00000000476ED840 LR 0x000000037FD5BDE0
V0 0x0000000000000000 0000000000000000 V1 0x0000000000000000
0000000000000000
V2 0x0000000000000000 0000000000000000 V3 0x0000000000000000
0000000000000000
V4 0x0000000000000000 0000000000000000 V5 0x0000000000000000
0000000000000000
V6 0x0000000000000000 0000000000000000 V7 0x0000000000000000
0000000000000000
V8 0x0000000000000000 0000000000000000 V9 0x0000000000000000
0000000000000000
V10 0x0000000000000000 0000000000000000 V11 0x0000000000000000
0000000000000000
V12 0x0000000000000000 0000000000000000 V13 0x0000000000000000
0000000000000000
V14 0x0000000000000000 0000000000000000 V15 0x0000000000000000
0000000000000000
V16 0x0000000000000000 0000000000000000 V17 0x0000000000000000
0000000000000000
V18 0x0000000000000000 0000000000000000 V19 0x0000000000000000
0000000000000000
V20 0x0000000000000000 0000000000000000 V21 0x0000000000000000
0000000000000000
V22 0x0000000000000000 0000000000000000 V23 0x0000000000000000
0000000000000000
V24 0x0000000000000000 0000000000000000 V25 0x0000000000000000
0000000000000000
V26 0x0000000000000000 0000000000000000 V27 0x0000000000000000
0000000000000000
V28 0x0000000000000000 0000000000000000 V29 0x0000000000000000
0000000000000000
V30 0x0000000000000000 0000000000000000 V31 0x0000000000000000
0000000000000000
SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205 FPSR
0x00000000
ESR 0x86000005 FAR 0x000000037FD5BDE0
ESR : EC 0x21 IL 0x1 ISS 0x00000005
Instruction abort: Translation fault, first level
Stack dump:
00000476ED740: 000000037CAA0000 000000037CAA0000 0060000000000000
FF9F000000000F3F
00000476ED760: 00000000476ED790 000000037FD610D8 0060000000000003
0000000C00000001
00000476ED780: 000000037CA0070D 000000037F179000 00000000476ED840
000000037FD5BCE0
00000476ED7A0: 0000000340000000 0000000000000001 000000037F179000
0000000047FFE000
00000476ED7C0: 0000000000000068 000000003FFFFFFF 000000037CAA0000
0000000000000002
00000476ED7E0: 000000037F179000 0000000047FFE068 000000037CC00000
000000037CAA0000
00000476ED800: 0060000000000000 FF9F000000000F3F 00000000476ED840
000000037FD610D8
00000476ED820: 0060000000000001 0000001500000001 000000034000070D
000000037F177000
> 00000476ED840: 00000000476ED8F0 000000037FD5BCE0 0000000047FFA000
0000000000000000
00000476ED860: 0000000047FFE000 0000000047FFF000 0000000000000000
0000007FFFFFFFFF
00000476ED880: 000000037CAA0000 0000000000000001 0000000047717588
0000000047FFF000
00000476ED8A0: 0000000380000000 000000037CAA0000 0060000000000000
FF9F000000000F3F
00000476ED8C0: 000000017FD605DD 000000037FD610D8 0060000000000001
0000001E00000001
00000476ED8E0: 0060000000000000 000000037F179000 00000000476ED9A0
000000037FD5C054
00000476ED900: 002000000000041C 0000000000000000 0000000047FFA000
0000000000004000
00000476ED920: 0000000334AA6000 0000000047FFF000 000000037F17A238
0000000047717000
ASSERT [ArmCpuDxe]
/builddir/build/BUILD/edk2-fff6d81270b5/ArmPkg/Library/DefaultExceptionHandlerLib/AArch64/DefaultExceptionHandler.c(333):
((BOOLEAN)(0==1))
----------------------------------------------------------------
Thanks,
Oliver
[-- Attachment #2: Type: text/html, Size: 8869 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-02-01 9:13 ` Oliver Steffen
@ 2023-02-01 11:51 ` Ard Biesheuvel
2023-02-01 12:58 ` Oliver Steffen
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-02-01 11:51 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com> wrote:
>
> Hi everyone!
>
> On Thu, Jan 19, 2023 at 2:21 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > On Thu, 19 Jan 2023 at 13:55, Oliver Steffen <osteffen@redhat.com> wrote:
> > >
> > > Quoting Gerd Hoffmann (2023-01-19 13:00:21)
> > > > Hi,
> > > >
> > > > > > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > > > > > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
> > > > > > No luck.
> > > > > >
> > > > >
> > > > > Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
> > > >
> > > > Probably not given that fedora kernels are almost vanilla and the
> > > > backport landed in stable in v6.1.7 (commit 9a1195c584321).
> > > >
> > > > Oliver: try pulling the latest kernel directly from koji:
> > > > https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
> > > >
> > > > take care,
> > > > Gerd
> > > >
> > >
> > > Thanks for the koji link, Gerd.
> > >
> > > It works with 6.1.7-200.fc37.aarch64.
> > > Perfect.
> > >
> > > Thanks Ard, thanks Marc!
> > >
> >
> > Thanks for the report. Up to this point, Marc is the only one who had
> > managed to reproduce this afaik, so having another data point where
> > the fix works as intended is rather nice.
> >
>
> I am sorry, this story does not seem to be over yet.
>
> We are using the Erratum patch and also included the commit 406504c7 in
> the kernel.
> Now the firmware crashes sometimes (10 out of 89 tests).
>
Thanks for the report. Is this still on ThunderX2?
> Any hints are very welcome!
>
Do you have access to those build artifacts?
Can you share more of the log output?
If you can rebuild, can you run it with DEBUG_VERBOSE set on ArmCpuDxe? E.g.,
--- a/ArmVirtPkg/ArmVirtQemu.dsc
+++ b/ArmVirtPkg/ArmVirtQemu.dsc
@@ -383,7 +383,11 @@ [Components.common]
#
# Architectural Protocols
#
- ArmPkg/Drivers/CpuDxe/CpuDxe.inf
+ ArmPkg/Drivers/CpuDxe/CpuDxe.inf {
+ <PcdsFixedAtBuild>
+ gEfiMdePkgTokenSpaceGuid.PcdDebugPrintErrorLevel|0x8040004F
+ }
+
MdeModulePkg/Core/RuntimeDxe/RuntimeDxe.inf
MdeModulePkg/Universal/Variable/RuntimeDxe/VariableRuntimeDxe.inf {
<LibraryClasses>
> Here is the serial output of one case:
> ----------------------------------------------------------------
> UEFI firmware (version edk2-20221207gitfff6d81270b5-4.el9.test built at 00:00:00 on Jan 18 2023)
> SyncPcrAllocationsAndPcrMask!
>
>
> Synchronous Exception at 0x000000037FD5BDE0
> PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
> PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
> PC 0x00037FD5BCE0 (0x00037FD59000+0x00002CE0) [ 0] ArmCpuDxe.dll
This, along with
> SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205 FPSR 0x00000000
> ESR 0x86000005 FAR 0x000000037FD5BDE0
taken from below suggests that UpdateRegionMappingRecursive () is
unmapping itself, likely when splitting a block entry.
This affects all ARM builds, not just ArmVirtQemu, so I would like to
get to the bottom of this.
Thanks,
Ard.
> PC 0x00037FD5C054 (0x00037FD59000+0x00003054) [ 0] ArmCpuDxe.dll
> PC 0x0000476F08EC (0x0000476EE000+0x000028EC) [ 1] DxeCore.dll
> PC 0x0000476F65C0 (0x0000476EE000+0x000085C0) [ 1] DxeCore.dll
> PC 0x0000476FB5B8 (0x0000476EE000+0x0000D5B8) [ 1] DxeCore.dll
> PC 0x0000476F5DB8 (0x0000476EE000+0x00007DB8) [ 1] DxeCore.dll
> PC 0x00037FD5CD50 (0x00037FD59000+0x00003D50) [ 2] ArmCpuDxe.dll
> PC 0x0000476F5758 (0x0000476EE000+0x00007758) [ 3] DxeCore.dll
> PC 0x000047702B18 (0x0000476EE000+0x00014B18) [ 3] DxeCore.dll
> PC 0x0000476F8D70 (0x0000476EE000+0x0000AD70) [ 3] DxeCore.dll
>
> [ 0] /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> [ 1] /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
> [ 2] /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> [ 3] /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
>
> X0 0x0000000047FFE068 X1 0x000000037F179003 X2 0x0000000000340000 X3 0x0000000000000000
> X4 0x0000000000000200 X5 0x0000000000000004 X6 0x0060000000000000 X7 0xFF9F000000000F3F
> X8 0x000000037FFFF008 X9 0x0000000400000000 X10 0x000000037F177000 X11 0x000000037FA37FFF
> X12 0x0000000000000000 X13 0x0000000000000008 X14 0x0000000000000000 X15 0x0000000000000000
> X16 0x000000037FD5A208 X17 0x00000000007979D0 X18 0x0000000000000000 X19 0x0000000340000000
> X20 0x0000000000000001 X21 0x000000037F179003 X22 0x0000000047FFE000 X23 0x0000000000000068
> X24 0x000000003FFFFFFF X25 0x000000037CAA0000 X26 0x0000000000000002 X27 0x000000037F179000
> X28 0x0000000047FFE068 FP 0x00000000476ED840 LR 0x000000037FD5BDE0
>
> V0 0x0000000000000000 0000000000000000 V1 0x0000000000000000 0000000000000000
> V2 0x0000000000000000 0000000000000000 V3 0x0000000000000000 0000000000000000
> V4 0x0000000000000000 0000000000000000 V5 0x0000000000000000 0000000000000000
> V6 0x0000000000000000 0000000000000000 V7 0x0000000000000000 0000000000000000
> V8 0x0000000000000000 0000000000000000 V9 0x0000000000000000 0000000000000000
> V10 0x0000000000000000 0000000000000000 V11 0x0000000000000000 0000000000000000
> V12 0x0000000000000000 0000000000000000 V13 0x0000000000000000 0000000000000000
> V14 0x0000000000000000 0000000000000000 V15 0x0000000000000000 0000000000000000
> V16 0x0000000000000000 0000000000000000 V17 0x0000000000000000 0000000000000000
> V18 0x0000000000000000 0000000000000000 V19 0x0000000000000000 0000000000000000
> V20 0x0000000000000000 0000000000000000 V21 0x0000000000000000 0000000000000000
> V22 0x0000000000000000 0000000000000000 V23 0x0000000000000000 0000000000000000
> V24 0x0000000000000000 0000000000000000 V25 0x0000000000000000 0000000000000000
> V26 0x0000000000000000 0000000000000000 V27 0x0000000000000000 0000000000000000
> V28 0x0000000000000000 0000000000000000 V29 0x0000000000000000 0000000000000000
> V30 0x0000000000000000 0000000000000000 V31 0x0000000000000000 0000000000000000
>
> SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205 FPSR 0x00000000
> ESR 0x86000005 FAR 0x000000037FD5BDE0
>
> ESR : EC 0x21 IL 0x1 ISS 0x00000005
>
> Instruction abort: Translation fault, first level
>
> Stack dump:
> 00000476ED740: 000000037CAA0000 000000037CAA0000 0060000000000000 FF9F000000000F3F
> 00000476ED760: 00000000476ED790 000000037FD610D8 0060000000000003 0000000C00000001
> 00000476ED780: 000000037CA0070D 000000037F179000 00000000476ED840 000000037FD5BCE0
> 00000476ED7A0: 0000000340000000 0000000000000001 000000037F179000 0000000047FFE000
> 00000476ED7C0: 0000000000000068 000000003FFFFFFF 000000037CAA0000 0000000000000002
> 00000476ED7E0: 000000037F179000 0000000047FFE068 000000037CC00000 000000037CAA0000
> 00000476ED800: 0060000000000000 FF9F000000000F3F 00000000476ED840 000000037FD610D8
> 00000476ED820: 0060000000000001 0000001500000001 000000034000070D 000000037F177000
> > 00000476ED840: 00000000476ED8F0 000000037FD5BCE0 0000000047FFA000 0000000000000000
> 00000476ED860: 0000000047FFE000 0000000047FFF000 0000000000000000 0000007FFFFFFFFF
> 00000476ED880: 000000037CAA0000 0000000000000001 0000000047717588 0000000047FFF000
> 00000476ED8A0: 0000000380000000 000000037CAA0000 0060000000000000 FF9F000000000F3F
> 00000476ED8C0: 000000017FD605DD 000000037FD610D8 0060000000000001 0000001E00000001
> 00000476ED8E0: 0060000000000000 000000037F179000 00000000476ED9A0 000000037FD5C054
> 00000476ED900: 002000000000041C 0000000000000000 0000000047FFA000 0000000000004000
> 00000476ED920: 0000000334AA6000 0000000047FFF000 000000037F17A238 0000000047717000
> ASSERT [ArmCpuDxe] /builddir/build/BUILD/edk2-fff6d81270b5/ArmPkg/Library/DefaultExceptionHandlerLib/AArch64/DefaultExceptionHandler.c(333): ((BOOLEAN)(0==1))
> ----------------------------------------------------------------
> Thanks,
> Oliver
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-02-01 11:51 ` Ard Biesheuvel
@ 2023-02-01 12:58 ` Oliver Steffen
2023-02-01 13:29 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-02-01 12:58 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
[-- Attachment #1: Type: text/plain, Size: 9087 bytes --]
On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > Hi everyone!
> >
> > On Thu, Jan 19, 2023 at 2:21 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > On Thu, 19 Jan 2023 at 13:55, Oliver Steffen <osteffen@redhat.com>
> wrote:
> > > >
> > > > Quoting Gerd Hoffmann (2023-01-19 13:00:21)
> > > > > Hi,
> > > > >
> > > > > > > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
> > > > > > > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from
> Fedora).
> > > > > > > No luck.
> > > > > > >
> > > > > >
> > > > > > Does that include a backport of commit
> 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
> > > > >
> > > > > Probably not given that fedora kernels are almost vanilla and the
> > > > > backport landed in stable in v6.1.7 (commit 9a1195c584321).
> > > > >
> > > > > Oliver: try pulling the latest kernel directly from koji:
> > > > > https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
> > > > >
> > > > > take care,
> > > > > Gerd
> > > > >
> > > >
> > > > Thanks for the koji link, Gerd.
> > > >
> > > > It works with 6.1.7-200.fc37.aarch64.
> > > > Perfect.
> > > >
> > > > Thanks Ard, thanks Marc!
> > > >
> > >
> > > Thanks for the report. Up to this point, Marc is the only one who had
> > > managed to reproduce this afaik, so having another data point where
> > > the fix works as intended is rather nice.
> > >
> >
> > I am sorry, this story does not seem to be over yet.
> >
> > We are using the Erratum patch and also included the commit 406504c7 in
> > the kernel.
> > Now the firmware crashes sometimes (10 out of 89 tests).
> >
>
> Thanks for the report. Is this still on ThunderX2?
>
> > Any hints are very welcome!
> >
>
> Do you have access to those build artifacts?
>
https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
and/or here:
https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
Source for reference:
https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
> Can you share more of the log output?
> If you can rebuild, can you run it with DEBUG_VERBOSE set on ArmCpuDxe.
> E.g.,
>
> --- a/ArmVirtPkg/ArmVirtQemu.dsc
> +++ b/ArmVirtPkg/ArmVirtQemu.dsc
> @@ -383,7 +383,11 @@ [Components.common]
> #
> # Architectural Protocols
> #
> - ArmPkg/Drivers/CpuDxe/CpuDxe.inf
> + ArmPkg/Drivers/CpuDxe/CpuDxe.inf {
> + <PcdsFixedAtBuild>
> + gEfiMdePkgTokenSpaceGuid.PcdDebugPrintErrorLevel|0x8040004F
> + }
> +
> MdeModulePkg/Core/RuntimeDxe/RuntimeDxe.inf
> MdeModulePkg/Universal/Variable/RuntimeDxe/VariableRuntimeDxe.inf {
> <LibraryClasses>
>
I'll try to do that.
> > Here is the serial output of one case:
> > ----------------------------------------------------------------
> > UEFI firmware (version edk2-20221207gitfff6d81270b5-4.el9.test built at
> 00:00:00 on Jan 18 2023)
> > SyncPcrAllocationsAndPcrMask!
> >
> >
> > Synchronous Exception at 0x000000037FD5BDE0
> > PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
> > PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
> > PC 0x00037FD5BCE0 (0x00037FD59000+0x00002CE0) [ 0] ArmCpuDxe.dll
>
> This, along with
>
> > SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205 FPSR
> 0x00000000
> > ESR 0x86000005 FAR 0x000000037FD5BDE0
>
> taken from below suggests that UpdateRegionMappingRecursive () is
> unmapping itself, likely when splitting a block entry.
>
> This affects all ARM builds, not just ArmVIrtQemu so I would like to
> get to the bottom of this.
>
>
> Thanks,
> Ard.
>
>
>
> > PC 0x00037FD5C054 (0x00037FD59000+0x00003054) [ 0] ArmCpuDxe.dll
> > PC 0x0000476F08EC (0x0000476EE000+0x000028EC) [ 1] DxeCore.dll
> > PC 0x0000476F65C0 (0x0000476EE000+0x000085C0) [ 1] DxeCore.dll
> > PC 0x0000476FB5B8 (0x0000476EE000+0x0000D5B8) [ 1] DxeCore.dll
> > PC 0x0000476F5DB8 (0x0000476EE000+0x00007DB8) [ 1] DxeCore.dll
> > PC 0x00037FD5CD50 (0x00037FD59000+0x00003D50) [ 2] ArmCpuDxe.dll
> > PC 0x0000476F5758 (0x0000476EE000+0x00007758) [ 3] DxeCore.dll
> > PC 0x000047702B18 (0x0000476EE000+0x00014B18) [ 3] DxeCore.dll
> > PC 0x0000476F8D70 (0x0000476EE000+0x0000AD70) [ 3] DxeCore.dll
> >
> > [ 0]
> /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> > [ 1]
> /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
> > [ 2]
> /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> > [ 3]
> /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
> >
> > X0 0x0000000047FFE068 X1 0x000000037F179003 X2 0x0000000000340000
> X3 0x0000000000000000
> > X4 0x0000000000000200 X5 0x0000000000000004 X6 0x0060000000000000
> X7 0xFF9F000000000F3F
> > X8 0x000000037FFFF008 X9 0x0000000400000000 X10 0x000000037F177000
> X11 0x000000037FA37FFF
> > X12 0x0000000000000000 X13 0x0000000000000008 X14 0x0000000000000000
> X15 0x0000000000000000
> > X16 0x000000037FD5A208 X17 0x00000000007979D0 X18 0x0000000000000000
> X19 0x0000000340000000
> > X20 0x0000000000000001 X21 0x000000037F179003 X22 0x0000000047FFE000
> X23 0x0000000000000068
> > X24 0x000000003FFFFFFF X25 0x000000037CAA0000 X26 0x0000000000000002
> X27 0x000000037F179000
> > X28 0x0000000047FFE068 FP 0x00000000476ED840 LR 0x000000037FD5BDE0
> >
> > V0 0x0000000000000000 0000000000000000 V1 0x0000000000000000
> 0000000000000000
> > V2 0x0000000000000000 0000000000000000 V3 0x0000000000000000
> 0000000000000000
> > V4 0x0000000000000000 0000000000000000 V5 0x0000000000000000
> 0000000000000000
> > V6 0x0000000000000000 0000000000000000 V7 0x0000000000000000
> 0000000000000000
> > V8 0x0000000000000000 0000000000000000 V9 0x0000000000000000
> 0000000000000000
> > V10 0x0000000000000000 0000000000000000 V11 0x0000000000000000
> 0000000000000000
> > V12 0x0000000000000000 0000000000000000 V13 0x0000000000000000
> 0000000000000000
> > V14 0x0000000000000000 0000000000000000 V15 0x0000000000000000
> 0000000000000000
> > V16 0x0000000000000000 0000000000000000 V17 0x0000000000000000
> 0000000000000000
> > V18 0x0000000000000000 0000000000000000 V19 0x0000000000000000
> 0000000000000000
> > V20 0x0000000000000000 0000000000000000 V21 0x0000000000000000
> 0000000000000000
> > V22 0x0000000000000000 0000000000000000 V23 0x0000000000000000
> 0000000000000000
> > V24 0x0000000000000000 0000000000000000 V25 0x0000000000000000
> 0000000000000000
> > V26 0x0000000000000000 0000000000000000 V27 0x0000000000000000
> 0000000000000000
> > V28 0x0000000000000000 0000000000000000 V29 0x0000000000000000
> 0000000000000000
> > V30 0x0000000000000000 0000000000000000 V31 0x0000000000000000
> 0000000000000000
> >
> > SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205 FPSR
> 0x00000000
> > ESR 0x86000005 FAR 0x000000037FD5BDE0
> >
> > ESR : EC 0x21 IL 0x1 ISS 0x00000005
> >
> > Instruction abort: Translation fault, first level
> >
> > Stack dump:
> > 00000476ED740: 000000037CAA0000 000000037CAA0000 0060000000000000
> FF9F000000000F3F
> > 00000476ED760: 00000000476ED790 000000037FD610D8 0060000000000003
> 0000000C00000001
> > 00000476ED780: 000000037CA0070D 000000037F179000 00000000476ED840
> 000000037FD5BCE0
> > 00000476ED7A0: 0000000340000000 0000000000000001 000000037F179000
> 0000000047FFE000
> > 00000476ED7C0: 0000000000000068 000000003FFFFFFF 000000037CAA0000
> 0000000000000002
> > 00000476ED7E0: 000000037F179000 0000000047FFE068 000000037CC00000
> 000000037CAA0000
> > 00000476ED800: 0060000000000000 FF9F000000000F3F 00000000476ED840
> 000000037FD610D8
> > 00000476ED820: 0060000000000001 0000001500000001 000000034000070D
> 000000037F177000
> > > 00000476ED840: 00000000476ED8F0 000000037FD5BCE0 0000000047FFA000
> 0000000000000000
> > 00000476ED860: 0000000047FFE000 0000000047FFF000 0000000000000000
> 0000007FFFFFFFFF
> > 00000476ED880: 000000037CAA0000 0000000000000001 0000000047717588
> 0000000047FFF000
> > 00000476ED8A0: 0000000380000000 000000037CAA0000 0060000000000000
> FF9F000000000F3F
> > 00000476ED8C0: 000000017FD605DD 000000037FD610D8 0060000000000001
> 0000001E00000001
> > 00000476ED8E0: 0060000000000000 000000037F179000 00000000476ED9A0
> 000000037FD5C054
> > 00000476ED900: 002000000000041C 0000000000000000 0000000047FFA000
> 0000000000004000
> > 00000476ED920: 0000000334AA6000 0000000047FFF000 000000037F17A238
> 0000000047717000
> > ASSERT [ArmCpuDxe]
> /builddir/build/BUILD/edk2-fff6d81270b5/ArmPkg/Library/DefaultExceptionHandlerLib/AArch64/DefaultExceptionHandler.c(333):
> ((BOOLEAN)(0==1))
> > ----------------------------------------------------------------
> > Thanks,
> > Oliver
> >
>
>
[-- Attachment #2: Type: text/html, Size: 12310 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-02-01 12:58 ` Oliver Steffen
@ 2023-02-01 13:29 ` Ard Biesheuvel
2023-02-02 11:09 ` Oliver Steffen
[not found] ` <173FFD60429C89C3.3213@groups.io>
0 siblings, 2 replies; 37+ messages in thread
From: Ard Biesheuvel @ 2023-02-01 13:29 UTC (permalink / raw)
To: devel, osteffen; +Cc: Gerd Hoffmann, Marc Zyngier, dann.frazier
On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <osteffen@redhat.com> wrote:
>
>
>
> On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>>
>> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com> wrote:
>> >
>> > Hi everyone!
>> >
>> > On Thu, Jan 19, 2023 at 2:21 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>> > >
>> > > On Thu, 19 Jan 2023 at 13:55, Oliver Steffen <osteffen@redhat.com> wrote:
>> > > >
>> > > > Quoting Gerd Hoffmann (2023-01-19 13:00:21)
>> > > > > Hi,
>> > > > >
>> > > > > > > I tried the most recent Qemu master (v7.2.50) and also v7.0.0,
>> > > > > > > on the 5.14 (RHEL) kernel and on 6.1.6-200.fc37.aarch64 (from Fedora).
>> > > > > > > No luck.
>> > > > > > >
>> > > > > >
>> > > > > > Does that include a backport of commit 406504c7b0405d74d74c15a667cd4c4620c3e7a9?
>> > > > >
>> > > > > Probably not given that fedora kernels are almost vanilla and the
>> > > > > backport landed in stable in v6.1.7 (commit 9a1195c584321).
>> > > > >
>> > > > > Oliver: try pulling the latest kernel directly from koji:
>> > > > > https://koji.fedoraproject.org/koji/buildinfo?buildID=2112315
>> > > > >
>> > > > > take care,
>> > > > > Gerd
>> > > > >
>> > > >
>> > > > Thanks for the koji link, Gerd.
>> > > >
>> > > > It works with 6.1.7-200.fc37.aarch64.
>> > > > Perfect.
>> > > >
>> > > > Thanks Ard, thanks Marc!
>> > > >
>> > >
>> > > Thanks for the report. Up to this point, Marc is the only one who had
>> > > managed to reproduce this afaik, so having another data point where
>> > > the fix works as intended is rather nice.
>> > >
>> >
>> > I am sorry, this story does not seem to be over yet.
>> >
>> > We are using the Erratum patch and also included the commit 406504c7 in
>> > the kernel.
>> > Now the firmware crashes sometimes (10 out of 89 tests).
>> >
>>
>> Thanks for the report. Is this still on ThunderX2?
>>
>> > Any hints are very welcome!
>> >
>>
>> Do you have access to those build artifacts?
>
>
> https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
>
> and/or here:
>
> https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
>
> Source for reference:
> https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
>
Any chance the .dll files (which are actually ELF executables) have
been preserved somewhere?
>> Can you share more of the log output?
>> If you can rebuild, can you run it with DEBUG_VERBOSE set on ArmCpuDxe. E.g.,
>>
>> --- a/ArmVirtPkg/ArmVirtQemu.dsc
>> +++ b/ArmVirtPkg/ArmVirtQemu.dsc
>> @@ -383,7 +383,11 @@ [Components.common]
>> #
>> # Architectural Protocols
>> #
>> - ArmPkg/Drivers/CpuDxe/CpuDxe.inf
>> + ArmPkg/Drivers/CpuDxe/CpuDxe.inf {
>> + <PcdsFixedAtBuild>
>> + gEfiMdePkgTokenSpaceGuid.PcdDebugPrintErrorLevel|0x8040004F
>> + }
>> +
>> MdeModulePkg/Core/RuntimeDxe/RuntimeDxe.inf
>> MdeModulePkg/Universal/Variable/RuntimeDxe/VariableRuntimeDxe.inf {
>> <LibraryClasses>
>
> I'll try to do that.
>
>
>>
>> > Here is the serial output of one case:
>> > ----------------------------------------------------------------
>> > UEFI firmware (version edk2-20221207gitfff6d81270b5-4.el9.test built at 00:00:00 on Jan 18 2023)
>> > SyncPcrAllocationsAndPcrMask!
>> >
>> >
>> > Synchronous Exception at 0x000000037FD5BDE0
>> > PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
>> > PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
>> > PC 0x00037FD5BCE0 (0x00037FD59000+0x00002CE0) [ 0] ArmCpuDxe.dll
>>
>> This, along with
>>
>> > SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205 FPSR 0x00000000
>> > ESR 0x86000005 FAR 0x000000037FD5BDE0
>>
>> taken from below suggests that UpdateRegionMappingRecursive () is
>> unmapping itself, likely when splitting a block entry.
>>
>> This affects all ARM builds, not just ArmVirtQemu, so I would like to
>> get to the bottom of this.
>>
>>
>> Thanks,
>> Ard.
>>
>>
>>
>> > PC 0x00037FD5C054 (0x00037FD59000+0x00003054) [ 0] ArmCpuDxe.dll
>> > PC 0x0000476F08EC (0x0000476EE000+0x000028EC) [ 1] DxeCore.dll
>> > PC 0x0000476F65C0 (0x0000476EE000+0x000085C0) [ 1] DxeCore.dll
>> > PC 0x0000476FB5B8 (0x0000476EE000+0x0000D5B8) [ 1] DxeCore.dll
>> > PC 0x0000476F5DB8 (0x0000476EE000+0x00007DB8) [ 1] DxeCore.dll
>> > PC 0x00037FD5CD50 (0x00037FD59000+0x00003D50) [ 2] ArmCpuDxe.dll
>> > PC 0x0000476F5758 (0x0000476EE000+0x00007758) [ 3] DxeCore.dll
>> > PC 0x000047702B18 (0x0000476EE000+0x00014B18) [ 3] DxeCore.dll
>> > PC 0x0000476F8D70 (0x0000476EE000+0x0000AD70) [ 3] DxeCore.dll
>> >
>> > [ 0] /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
>> > [ 1] /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
>> > [ 2] /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
>> > [ 3] /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
>> >
>> > X0 0x0000000047FFE068 X1 0x000000037F179003 X2 0x0000000000340000 X3 0x0000000000000000
>> > X4 0x0000000000000200 X5 0x0000000000000004 X6 0x0060000000000000 X7 0xFF9F000000000F3F
>> > X8 0x000000037FFFF008 X9 0x0000000400000000 X10 0x000000037F177000 X11 0x000000037FA37FFF
>> > X12 0x0000000000000000 X13 0x0000000000000008 X14 0x0000000000000000 X15 0x0000000000000000
>> > X16 0x000000037FD5A208 X17 0x00000000007979D0 X18 0x0000000000000000 X19 0x0000000340000000
>> > X20 0x0000000000000001 X21 0x000000037F179003 X22 0x0000000047FFE000 X23 0x0000000000000068
>> > X24 0x000000003FFFFFFF X25 0x000000037CAA0000 X26 0x0000000000000002 X27 0x000000037F179000
>> > X28 0x0000000047FFE068 FP 0x00000000476ED840 LR 0x000000037FD5BDE0
>> >
>> > V0 0x0000000000000000 0000000000000000 V1 0x0000000000000000 0000000000000000
>> > V2 0x0000000000000000 0000000000000000 V3 0x0000000000000000 0000000000000000
>> > V4 0x0000000000000000 0000000000000000 V5 0x0000000000000000 0000000000000000
>> > V6 0x0000000000000000 0000000000000000 V7 0x0000000000000000 0000000000000000
>> > V8 0x0000000000000000 0000000000000000 V9 0x0000000000000000 0000000000000000
>> > V10 0x0000000000000000 0000000000000000 V11 0x0000000000000000 0000000000000000
>> > V12 0x0000000000000000 0000000000000000 V13 0x0000000000000000 0000000000000000
>> > V14 0x0000000000000000 0000000000000000 V15 0x0000000000000000 0000000000000000
>> > V16 0x0000000000000000 0000000000000000 V17 0x0000000000000000 0000000000000000
>> > V18 0x0000000000000000 0000000000000000 V19 0x0000000000000000 0000000000000000
>> > V20 0x0000000000000000 0000000000000000 V21 0x0000000000000000 0000000000000000
>> > V22 0x0000000000000000 0000000000000000 V23 0x0000000000000000 0000000000000000
>> > V24 0x0000000000000000 0000000000000000 V25 0x0000000000000000 0000000000000000
>> > V26 0x0000000000000000 0000000000000000 V27 0x0000000000000000 0000000000000000
>> > V28 0x0000000000000000 0000000000000000 V29 0x0000000000000000 0000000000000000
>> > V30 0x0000000000000000 0000000000000000 V31 0x0000000000000000 0000000000000000
>> >
>> > SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205 FPSR 0x00000000
>> > ESR 0x86000005 FAR 0x000000037FD5BDE0
>> >
>> > ESR : EC 0x21 IL 0x1 ISS 0x00000005
>> >
>> > Instruction abort: Translation fault, first level
>> >
>> > Stack dump:
>> > 00000476ED740: 000000037CAA0000 000000037CAA0000 0060000000000000 FF9F000000000F3F
>> > 00000476ED760: 00000000476ED790 000000037FD610D8 0060000000000003 0000000C00000001
>> > 00000476ED780: 000000037CA0070D 000000037F179000 00000000476ED840 000000037FD5BCE0
>> > 00000476ED7A0: 0000000340000000 0000000000000001 000000037F179000 0000000047FFE000
>> > 00000476ED7C0: 0000000000000068 000000003FFFFFFF 000000037CAA0000 0000000000000002
>> > 00000476ED7E0: 000000037F179000 0000000047FFE068 000000037CC00000 000000037CAA0000
>> > 00000476ED800: 0060000000000000 FF9F000000000F3F 00000000476ED840 000000037FD610D8
>> > 00000476ED820: 0060000000000001 0000001500000001 000000034000070D 000000037F177000
>> > > 00000476ED840: 00000000476ED8F0 000000037FD5BCE0 0000000047FFA000 0000000000000000
>> > 00000476ED860: 0000000047FFE000 0000000047FFF000 0000000000000000 0000007FFFFFFFFF
>> > 00000476ED880: 000000037CAA0000 0000000000000001 0000000047717588 0000000047FFF000
>> > 00000476ED8A0: 0000000380000000 000000037CAA0000 0060000000000000 FF9F000000000F3F
>> > 00000476ED8C0: 000000017FD605DD 000000037FD610D8 0060000000000001 0000001E00000001
>> > 00000476ED8E0: 0060000000000000 000000037F179000 00000000476ED9A0 000000037FD5C054
>> > 00000476ED900: 002000000000041C 0000000000000000 0000000047FFA000 0000000000004000
>> > 00000476ED920: 0000000334AA6000 0000000047FFF000 000000037F17A238 0000000047717000
>> > ASSERT [ArmCpuDxe] /builddir/build/BUILD/edk2-fff6d81270b5/ArmPkg/Library/DefaultExceptionHandlerLib/AArch64/DefaultExceptionHandler.c(333): ((BOOLEAN)(0==1))
>> > ----------------------------------------------------------------
>> > Thanks,
>> > Oliver
>> >
>>
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-02-01 13:29 ` Ard Biesheuvel
@ 2023-02-02 11:09 ` Oliver Steffen
[not found] ` <173FFD60429C89C3.3213@groups.io>
1 sibling, 0 replies; 37+ messages in thread
From: Oliver Steffen @ 2023-02-02 11:09 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
[-- Attachment #1: Type: text/plain, Size: 8684 bytes --]
On Wed, Feb 1, 2023 at 2:29 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >>
> >> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com>
> wrote:
> >> >
>
[...]
> >> > I am sorry, this story does not seem to be over yet.
> >> >
> >> > We are using the Erratum patch and also included the commit 406504c7
> in
> >> > the kernel.
> >> > Now the firmware crashes sometimes (10 out of 89 tests).
> >> >
> >>
> >> Thanks for the report. Is this still on ThunderX2?
> >>
> >> > Any hints are very welcome!
> >> >
> >>
> >> Do you have access to those build artifacts?
> >
> >
> >
> https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
> >
> > and/or here:
> >
> > https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
> >
> > Source for reference:
> > https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
> >
>
> Any chance the .dll files (which are actually ELF executables) have
> been preserved somewhere?
>
Here is the build folder (~90MB):
https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/armvirt-thunderx2-issue.tar.xz
I am waiting for the tests with the additional debug output to run.
> >> Can you share more of the log output?
> >> If you can rebuild, can you run it with DEBUG_VERBOSE set on ArmCpuDxe.
> E.g.,
> >>
> >> --- a/ArmVirtPkg/ArmVirtQemu.dsc
> >> +++ b/ArmVirtPkg/ArmVirtQemu.dsc
> >> @@ -383,7 +383,11 @@ [Components.common]
> >> #
> >> # Architectural Protocols
> >> #
> >> - ArmPkg/Drivers/CpuDxe/CpuDxe.inf
> >> + ArmPkg/Drivers/CpuDxe/CpuDxe.inf {
> >> + <PcdsFixedAtBuild>
> >> + gEfiMdePkgTokenSpaceGuid.PcdDebugPrintErrorLevel|0x8040004F
> >> + }
> >> +
> >> MdeModulePkg/Core/RuntimeDxe/RuntimeDxe.inf
> >> MdeModulePkg/Universal/Variable/RuntimeDxe/VariableRuntimeDxe.inf {
> >> <LibraryClasses>
> >
> > I'll try to do that.
> >
> >
> >>
> >> > Here is the serial output of one case:
> >> > ----------------------------------------------------------------
> >> > UEFI firmware (version edk2-20221207gitfff6d81270b5-4.el9.test built
> at 00:00:00 on Jan 18 2023)
> >> > SyncPcrAllocationsAndPcrMask!
> >> >
> >> >
> >> > Synchronous Exception at 0x000000037FD5BDE0
> >> > PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
> >> > PC 0x00037FD5BDE0 (0x00037FD59000+0x00002DE0) [ 0] ArmCpuDxe.dll
> >> > PC 0x00037FD5BCE0 (0x00037FD59000+0x00002CE0) [ 0] ArmCpuDxe.dll
> >>
> >> This, along with
> >>
> >> > SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205
> FPSR 0x00000000
> >> > ESR 0x86000005 FAR 0x000000037FD5BDE0
> >>
> >> taken from below suggests that UpdateRegionMappingRecursive () is
> >> unmapping itself, likely when splitting a block entry.
> >>
> >> This affects all ARM builds, not just ArmVirtQemu, so I would like to
> >> get to the bottom of this.
> >>
> >>
> >> Thanks,
> >> Ard.
> >>
> >>
> >>
> >> > PC 0x00037FD5C054 (0x00037FD59000+0x00003054) [ 0] ArmCpuDxe.dll
> >> > PC 0x0000476F08EC (0x0000476EE000+0x000028EC) [ 1] DxeCore.dll
> >> > PC 0x0000476F65C0 (0x0000476EE000+0x000085C0) [ 1] DxeCore.dll
> >> > PC 0x0000476FB5B8 (0x0000476EE000+0x0000D5B8) [ 1] DxeCore.dll
> >> > PC 0x0000476F5DB8 (0x0000476EE000+0x00007DB8) [ 1] DxeCore.dll
> >> > PC 0x00037FD5CD50 (0x00037FD59000+0x00003D50) [ 2] ArmCpuDxe.dll
> >> > PC 0x0000476F5758 (0x0000476EE000+0x00007758) [ 3] DxeCore.dll
> >> > PC 0x000047702B18 (0x0000476EE000+0x00014B18) [ 3] DxeCore.dll
> >> > PC 0x0000476F8D70 (0x0000476EE000+0x0000AD70) [ 3] DxeCore.dll
> >> >
> >> > [ 0]
> /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> >> > [ 1]
> /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
> >> > [ 2]
> /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> >> > [ 3]
> /builddir/build/BUILD/edk2-fff6d81270b5/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
> >> >
> >> > X0 0x0000000047FFE068 X1 0x000000037F179003 X2
> 0x0000000000340000 X3 0x0000000000000000
> >> > X4 0x0000000000000200 X5 0x0000000000000004 X6
> 0x0060000000000000 X7 0xFF9F000000000F3F
> >> > X8 0x000000037FFFF008 X9 0x0000000400000000 X10
> 0x000000037F177000 X11 0x000000037FA37FFF
> >> > X12 0x0000000000000000 X13 0x0000000000000008 X14
> 0x0000000000000000 X15 0x0000000000000000
> >> > X16 0x000000037FD5A208 X17 0x00000000007979D0 X18
> 0x0000000000000000 X19 0x0000000340000000
> >> > X20 0x0000000000000001 X21 0x000000037F179003 X22
> 0x0000000047FFE000 X23 0x0000000000000068
> >> > X24 0x000000003FFFFFFF X25 0x000000037CAA0000 X26
> 0x0000000000000002 X27 0x000000037F179000
> >> > X28 0x0000000047FFE068 FP 0x00000000476ED840 LR
> 0x000000037FD5BDE0
> >> >
> >> > V0 0x0000000000000000 0000000000000000 V1 0x0000000000000000
> 0000000000000000
> >> > V2 0x0000000000000000 0000000000000000 V3 0x0000000000000000
> 0000000000000000
> >> > V4 0x0000000000000000 0000000000000000 V5 0x0000000000000000
> 0000000000000000
> >> > V6 0x0000000000000000 0000000000000000 V7 0x0000000000000000
> 0000000000000000
> >> > V8 0x0000000000000000 0000000000000000 V9 0x0000000000000000
> 0000000000000000
> >> > V10 0x0000000000000000 0000000000000000 V11 0x0000000000000000
> 0000000000000000
> >> > V12 0x0000000000000000 0000000000000000 V13 0x0000000000000000
> 0000000000000000
> >> > V14 0x0000000000000000 0000000000000000 V15 0x0000000000000000
> 0000000000000000
> >> > V16 0x0000000000000000 0000000000000000 V17 0x0000000000000000
> 0000000000000000
> >> > V18 0x0000000000000000 0000000000000000 V19 0x0000000000000000
> 0000000000000000
> >> > V20 0x0000000000000000 0000000000000000 V21 0x0000000000000000
> 0000000000000000
> >> > V22 0x0000000000000000 0000000000000000 V23 0x0000000000000000
> 0000000000000000
> >> > V24 0x0000000000000000 0000000000000000 V25 0x0000000000000000
> 0000000000000000
> >> > V26 0x0000000000000000 0000000000000000 V27 0x0000000000000000
> 0000000000000000
> >> > V28 0x0000000000000000 0000000000000000 V29 0x0000000000000000
> 0000000000000000
> >> > V30 0x0000000000000000 0000000000000000 V31 0x0000000000000000
> 0000000000000000
> >> >
> >> > SP 0x00000000476ED840 ELR 0x000000037FD5BDE0 SPSR 0x80000205
> FPSR 0x00000000
> >> > ESR 0x86000005 FAR 0x000000037FD5BDE0
> >> >
> >> > ESR : EC 0x21 IL 0x1 ISS 0x00000005
> >> >
> >> > Instruction abort: Translation fault, first level
> >> >
> >> > Stack dump:
> >> > 00000476ED740: 000000037CAA0000 000000037CAA0000 0060000000000000
> FF9F000000000F3F
> >> > 00000476ED760: 00000000476ED790 000000037FD610D8 0060000000000003
> 0000000C00000001
> >> > 00000476ED780: 000000037CA0070D 000000037F179000 00000000476ED840
> 000000037FD5BCE0
> >> > 00000476ED7A0: 0000000340000000 0000000000000001 000000037F179000
> 0000000047FFE000
> >> > 00000476ED7C0: 0000000000000068 000000003FFFFFFF 000000037CAA0000
> 0000000000000002
> >> > 00000476ED7E0: 000000037F179000 0000000047FFE068 000000037CC00000
> 000000037CAA0000
> >> > 00000476ED800: 0060000000000000 FF9F000000000F3F 00000000476ED840
> 000000037FD610D8
> >> > 00000476ED820: 0060000000000001 0000001500000001 000000034000070D
> 000000037F177000
> >> > > 00000476ED840: 00000000476ED8F0 000000037FD5BCE0 0000000047FFA000
> 0000000000000000
> >> > 00000476ED860: 0000000047FFE000 0000000047FFF000 0000000000000000
> 0000007FFFFFFFFF
> >> > 00000476ED880: 000000037CAA0000 0000000000000001 0000000047717588
> 0000000047FFF000
> >> > 00000476ED8A0: 0000000380000000 000000037CAA0000 0060000000000000
> FF9F000000000F3F
> >> > 00000476ED8C0: 000000017FD605DD 000000037FD610D8 0060000000000001
> 0000001E00000001
> >> > 00000476ED8E0: 0060000000000000 000000037F179000 00000000476ED9A0
> 000000037FD5C054
> >> > 00000476ED900: 002000000000041C 0000000000000000 0000000047FFA000
> 0000000000004000
> >> > 00000476ED920: 0000000334AA6000 0000000047FFF000 000000037F17A238
> 0000000047717000
> >> > ASSERT [ArmCpuDxe]
> /builddir/build/BUILD/edk2-fff6d81270b5/ArmPkg/Library/DefaultExceptionHandlerLib/AArch64/DefaultExceptionHandler.c(333):
> ((BOOLEAN)(0==1))
> >> > ----------------------------------------------------------------
> >> > Thanks,
> >> > Oliver
> >> >
>
>
[-- Attachment #2: Type: text/html, Size: 12040 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
[not found] ` <173FFD60429C89C3.3213@groups.io>
@ 2023-02-07 10:51 ` Oliver Steffen
2023-02-07 11:56 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-02-07 10:51 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
[-- Attachment #1: Type: text/plain, Size: 2120 bytes --]
On Thu, Feb 2, 2023 at 12:09 PM Oliver Steffen <osteffen@redhat.com> wrote:
>
> On Wed, Feb 1, 2023 at 2:29 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
>> On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <osteffen@redhat.com> wrote:
>> >
>> > On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>> >>
>> >> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com>
>> wrote:
>> >> >
>>
> [...]
>
>> >> > I am sorry, this story does not seem to be over yet.
>> >> >
>> >> > We are using the Erratum patch and also included the commit 406504c7
>> in
>> >> > the kernel.
>> >> > Now the firmware crashes sometimes (10 out of 89 tests).
>> >> >
>> >>
>> >> Thanks for the report. Is this still on ThunderX2?
>> >>
>> >> > Any hints are very welcome!
>> >> >
>> >>
>> >> Do you have access to those build artifacts?
>> >
>> >
>> >
>> https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
>> >
>> > and/or here:
>> >
>> > https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
>> >
>> > Source for reference:
>> > https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
>> >
>>
>> Any chance the .dll files (which are actually ELF executables) have
>> been preserved somewhere?
>>
> Here is the build folder (~90MB):
>
> https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/armvirt-thunderx2-issue.tar.xz
>
> I am waiting for the tests with the additional debug output to run.
>
We reran the test suite with the Erratum patch and the additional debug
output enabled. Strangely, the problem does not occur anymore; the
firmware boots up normally.
We retried the tests without the additional debug output.
RHEL ships two firmware flavors for AARCH64: a silent and a verbose
version. Both were tried. We see no problems with the verbose
one. The silent one fails noticeably more often if a software TPM device
is present.
Could this be related to how much is going on in the early phase
of the firmware (when logging is enabled: formatting of messages and
sending them to the serial port...)?
Thanks,
Oliver
[-- Attachment #2: Type: text/html, Size: 4299 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-02-07 10:51 ` Oliver Steffen
@ 2023-02-07 11:56 ` Ard Biesheuvel
2023-02-07 12:58 ` Oliver Steffen
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-02-07 11:56 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
On Tue, 7 Feb 2023 at 11:51, Oliver Steffen <osteffen@redhat.com> wrote:
>
> On Thu, Feb 2, 2023 at 12:09 PM Oliver Steffen <osteffen@redhat.com> wrote:
>>
>>
>> On Wed, Feb 1, 2023 at 2:29 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>>>
>>> On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <osteffen@redhat.com> wrote:
>>> >
>>> > On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>>> >>
>>> >> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com> wrote:
>>> >> >
>>
>> [...]
>>>
>>> >> > I am sorry, this story does not seem to be over yet.
>>> >> >
>>> >> > We are using the Erratum patch and also included the commit 406504c7 in
>>> >> > the kernel.
>>> >> > Now the firmware crashes sometimes (10 out of 89 tests).
>>> >> >
>>> >>
>>> >> Thanks for the report. Is this still on ThunderX2?
>>> >>
>>> >> > Any hints are very welcome!
>>> >> >
>>> >>
>>> >> Do you have access to those build artifacts?
>>> >
>>> >
>>> > https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
>>> >
>>> > and/or here:
>>> >
>>> > https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
>>> >
>>> > Source for reference:
>>> > https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
>>> >
>>>
>>> Any chance the .dll files (which are actually ELF executables) have
>>> been preserved somewhere?
>>
>> Here is the build folder (~90MB):
>> https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/armvirt-thunderx2-issue.tar.xz
>>
>> I am waiting for the tests with the additional debug output to run.
>
>
> We reran the test suite with the Erratum and the additional debug
> output enabled. Strangely, the problem does not occur anymore, the
> firmware boots up normally.
>
> We retried the tests without the additional debug output.
> RHEL ships two firmware flavors for AARCH64: a silent and a verbose
> version.
Are these RELEASE vs DEBUG builds?
> Both were tried. We see no problems with the verbose
> one. The silent one fails noticeably more often if a software TPM device
> is present.
>
This smells like some missing cache or TLB maintenance - the verbose
one exits to the host much more often, and likely relies on cache/TLB
maintenance occurring in the hypervisor.
So the build always includes TPM support but the issue only occurs
when the sw TPM is actually exposed by QEMU?
> Could this be related to how much stuff is going on in the early phase
> of the firmware (when logging is enabled: formatting of messages and
> sending to serial port...) ?
>
I'll try to see if I can rig something up that logs into a buffer
rather than straight to the serial port, and dumps it all out when
handling the crash.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-02-07 11:56 ` Ard Biesheuvel
@ 2023-02-07 12:58 ` Oliver Steffen
2023-02-09 15:15 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-02-07 12:58 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
[-- Attachment #1: Type: text/plain, Size: 3233 bytes --]
On Tue, Feb 7, 2023 at 12:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> On Tue, 7 Feb 2023 at 11:51, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > On Thu, Feb 2, 2023 at 12:09 PM Oliver Steffen <osteffen@redhat.com>
> wrote:
> >>
> >>
> >> On Wed, Feb 1, 2023 at 2:29 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >>>
> >>> On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <osteffen@redhat.com>
> wrote:
> >>> >
> >>> > On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org>
> wrote:
> >>> >>
> >>> >> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com>
> wrote:
> >>> >> >
> >>
> >> [...]
> >>>
> >>> >> > I am sorry, this story does not seem to be over yet.
> >>> >> >
> >>> >> > We are using the Erratum patch and also included the commit
> 406504c7 in
> >>> >> > the kernel.
> >>> >> > Now the firmware crashes sometimes (10 out of 89 tests).
> >>> >> >
> >>> >>
> >>> >> Thanks for the report. Is this still on ThunderX2?
> >>> >>
> >>> >> > Any hints are very welcome!
> >>> >> >
> >>> >>
> >>> >> Do you have access to those build artifacts?
> >>> >
> >>> >
> >>> >
> https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
> >>> >
> >>> > and/or here:
> >>> >
> >>> > https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
> >>> >
> >>> > Source for reference:
> >>> > https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
> >>> >
> >>>
> >>> Any chance the .dll files (which are actually ELF executables) have
> >>> been preserved somewhere?
> >>
> >> Here is the build folder (~90MB):
> >>
> https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/armvirt-thunderx2-issue.tar.xz
> >>
> >> I am waiting for the tests with the additional debug output to run.
> >
> >
> > We reran the test suite with the Erratum and the additional debug
> > output enabled. Strangely, the problem does not occur anymore, the
> > firmware boots up normally.
> >
> > We retried the tests without the additional debug output.
> > RHEL ships two firmware flavors for AARCH64: a silent and a verbose
> > version.
>
> Are these RELEASE vs DEBUG builds?
>
All builds are DEBUG; only the amount of information printed on
the serial port differs (almost zero for the "silent" one).
> > Both were tried. We see no problems with the verbose
> > one. The silent one fails noticeably more often if a software TPM device
> > is present.
> >
>
> This smells like some missing cache or TLB maintenance - the verbose
> one exits to the host much more often, and likely relies on cache/TLB
> maintenance occurring in the hypervisor.
>
> So the build always includes TPM support but the issue only occurs
> when the sw TPM is actually exposed by QEMU?
>
Yes.
All builds include support for TPM, but the issue occurs more frequently
if a sw TPM is exposed by QEMU.
> > Could this be related to how much stuff is going on in the early phase
> > of the firmware (when logging is enabled: formatting of messages and
> > sending to serial port...) ?
> >
>
> I'll try to see if I can rig something up that logs into a buffer
> rather than straight to the serial, and dump it all out when handling
> the crash
>
Awesome.
Thanks,
Oliver
[-- Attachment #2: Type: text/html, Size: 6233 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-02-07 12:58 ` Oliver Steffen
@ 2023-02-09 15:15 ` Ard Biesheuvel
2023-03-02 10:50 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-02-09 15:15 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
On Tue, 7 Feb 2023 at 13:58, Oliver Steffen <osteffen@redhat.com> wrote:
>
> On Tue, Feb 7, 2023 at 12:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>>
>> On Tue, 7 Feb 2023 at 11:51, Oliver Steffen <osteffen@redhat.com> wrote:
>> >
>> > On Thu, Feb 2, 2023 at 12:09 PM Oliver Steffen <osteffen@redhat.com> wrote:
>> >>
>> >>
>> >> On Wed, Feb 1, 2023 at 2:29 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>> >>>
>> >>> On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <osteffen@redhat.com> wrote:
>> >>> >
>> >>> > On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>> >>> >>
>> >>> >> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com> wrote:
>> >>> >> >
>> >>
>> >> [...]
>> >>>
>> >>> >> > I am sorry, this story does not seem to be over yet.
>> >>> >> >
>> >>> >> > We are using the Erratum patch and also included the commit 406504c7 in
>> >>> >> > the kernel.
>> >>> >> > Now the firmware crashes sometimes (10 out of 89 tests).
>> >>> >> >
>> >>> >>
>> >>> >> Thanks for the report. Is this still on ThunderX2?
>> >>> >>
>> >>> >> > Any hints are very welcome!
>> >>> >> >
>> >>> >>
>> >>> >> Do you have access to those build artifacts?
>> >>> >
>> >>> >
>> >>> > https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
>> >>> >
>> >>> > and/or here:
>> >>> >
>> >>> > https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
>> >>> >
>> >>> > Source for reference:
>> >>> > https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
>> >>> >
>> >>>
>> >>> Any chance the .dll files (which are actually ELF executables) have
>> >>> been preserved somewhere?
>> >>
>> >> Here is the build folder (~90MB):
>> >> https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/armvirt-thunderx2-issue.tar.xz
>> >>
>> >> I am waiting for the tests with the additional debug output to run.
>> >
>> >
>> > We reran the test suite with the Erratum and the additional debug
>> > output enabled. Strangely, the problem does not occur anymore, the
>> > firmware boots up normally.
>> >
>> > We retried the tests without the additional debug output.
>> > RHEL ships two firmware flavors for AARCH64: a silent and a verbose
>> > version.
>>
>> Are these RELEASE vs DEBUG builds?
>
>
> All builds are DEBUG, just the amount of information printed on
> the serial is different (almost zero for the "silent" one.)
>
>>
>> > Both were tried. We see no problems with the verbose
>> > one. The silent one fails noticeably more often if a software TPM device
>> > is present.
>> >
>>
>> This smells like some missing cache or TLB maintenance - the verbose
>> one exits to the host much more often, and likely relies on cache/TLB
>> maintenance occurring in the hypervisor.
>>
>> So the build always includes TPM support but the issue only occurs
>> when the sw TPM is actually exposed by QEMU?
>
>
> Yes.
> All builds include support for TPM, but the issue occurs more frequently
> if a sw TPM is exposed by QEMU.
>
Any chance you could provide a specific command line for launching
QEMU? I am trying to reproduce this, but I am not making any progress.
>>
>> > Could this be related to how much stuff is going on in the early phase
>> > of the firmware (when logging is enabled: formatting of messages and
>> > sending to serial port...) ?
>> >
>>
>> I'll try to see if I can rig something up that logs into a buffer
>> rather than straight to the serial, and dump it all out when handling
>> the crash
>>
This takes a bit more time than I can afford to spend on this at the moment,
and I'd like to be able to reproduce the issue before I go down this rabbit hole.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-02-09 15:15 ` Ard Biesheuvel
@ 2023-03-02 10:50 ` Ard Biesheuvel
2023-03-02 13:29 ` Oliver Steffen
[not found] ` <17489D498A098DB9.9697@groups.io>
0 siblings, 2 replies; 37+ messages in thread
From: Ard Biesheuvel @ 2023-03-02 10:50 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
On Thu, 9 Feb 2023 at 16:15, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Tue, 7 Feb 2023 at 13:58, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> > On Tue, Feb 7, 2023 at 12:57 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >>
> >> On Tue, 7 Feb 2023 at 11:51, Oliver Steffen <osteffen@redhat.com> wrote:
> >> >
> >> > On Thu, Feb 2, 2023 at 12:09 PM Oliver Steffen <osteffen@redhat.com> wrote:
> >> >>
> >> >>
> >> >> On Wed, Feb 1, 2023 at 2:29 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >> >>>
> >> >>> On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <osteffen@redhat.com> wrote:
> >> >>> >
> >> >>> > On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> >> >>> >>
> >> >>> >> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <osteffen@redhat.com> wrote:
> >> >>> >> >
> >> >>
> >> >> [...]
> >> >>>
> >> >>> >> > I am sorry, this story does not seem to be over yet.
> >> >>> >> >
> >> >>> >> > We are using the Erratum patch and also included the commit 406504c7 in
> >> >>> >> > the kernel.
> >> >>> >> > Now the firmware crashes sometimes (10 out of 89 tests).
> >> >>> >> >
> >> >>> >>
> >> >>> >> Thanks for the report. Is this still on ThunderX2?
> >> >>> >>
> >> >>> >> > Any hints are very welcome!
> >> >>> >> >
> >> >>> >>
> >> >>> >> Do you have access to those build artifacts?
> >> >>> >
> >> >>> >
> >> >>> > https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
> >> >>> >
> >> >>> > and/or here:
> >> >>> >
> >> >>> > https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
> >> >>> >
> >> >>> > Source for reference:
> >> >>> > https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
> >> >>> >
> >> >>>
> >> >>> Any chance the .dll files (which are actually ELF executables) have
> >> >>> been preserved somewhere?
> >> >>
> >> >> Here is the build folder (~90MB):
> >> >> https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/armvirt-thunderx2-issue.tar.xz
> >> >>
> >> >> I am waiting for the tests with the additional debug output to run.
> >> >
> >> >
> >> > We reran the test suite with the Erratum and the additional debug
> >> > output enabled. Strangely, the problem does not occur anymore, the
> >> > firmware boots up normally.
> >> >
> >> > We retried the tests without the additional debug output.
> >> > RHEL ships two firmware flavors for AARCH64: a silent and a verbose
> >> > version.
> >>
> >> Are these RELEASE vs DEBUG builds?
> >
> >
> > All builds are DEBUG, just the amount of information printed on
> > the serial is different (almost zero for the "silent" one.)
> >
> >>
> >> > Both were tried. We see no problems with the verbose
> >> > one. The silent one fails noticeably more often if a software TPM device
> >> > is present.
> >> >
> >>
> >> This smells like some missing cache or TLB maintenance - the verbose
> >> one exits to the host much more often, and likely relies on cache/TLB
> >> maintenance occurring in the hypervisor.
> >>
> >> So the build always includes TPM support but the issue only occurs
> >> when the sw TPM is actually exposed by QEMU?
> >
> >
> > Yes.
> > All builds include support for TPM, but the issue occurs more frequently
> > if a sw TPM is exposed by QEMU.
> >
>
> Any chance you could provide a specific command line for launching
> QEMU? I am trying to reproduce this, but I am not making any progress.
>
> >>
> >> > Could this be related to how much stuff is going on in the early phase
> >> > of the firmware (when logging is enabled: formatting of messages and
> >> > sending to serial port...) ?
> >> >
> >>
> >> I'll try to see if I can rig something up that logs into a buffer
> >> rather than straight to the serial, and dump it all out when handling
> >> the crash
> >>
>
> This takes a bit more time than I can afford to spend on this atm, and
> I'd like to be able to reproduce before I go down this rabbit hole.
Have there been any developments regarding this issue?
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-03-02 10:50 ` Ard Biesheuvel
@ 2023-03-02 13:29 ` Oliver Steffen
[not found] ` <17489D498A098DB9.9697@groups.io>
1 sibling, 0 replies; 37+ messages in thread
From: Oliver Steffen @ 2023-03-02 13:29 UTC (permalink / raw)
To: devel, ardb; +Cc: Gerd Hoffmann, Marc Zyngier, dann.frazier
[-- Attachment #1: Type: text/plain, Size: 4620 bytes --]
On Thu, Mar 2, 2023 at 11:50 AM Ard Biesheuvel <ardb@kernel.org> wrote:
> On Thu, 9 Feb 2023 at 16:15, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > On Tue, 7 Feb 2023 at 13:58, Oliver Steffen <osteffen@redhat.com> wrote:
> > >
> > > On Tue, Feb 7, 2023 at 12:57 PM Ard Biesheuvel <ardb@kernel.org>
> wrote:
> > >>
> > >> On Tue, 7 Feb 2023 at 11:51, Oliver Steffen <osteffen@redhat.com>
> wrote:
> > >> >
> > >> > On Thu, Feb 2, 2023 at 12:09 PM Oliver Steffen <osteffen@redhat.com>
> wrote:
> > >> >>
> > >> >>
> > >> >> On Wed, Feb 1, 2023 at 2:29 PM Ard Biesheuvel <ardb@kernel.org>
> wrote:
> > >> >>>
> > >> >>> On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <osteffen@redhat.com>
> wrote:
> > >> >>> >
> > >> >>> > On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <ardb@kernel.org>
> wrote:
> > >> >>> >>
> > >> >>> >> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <
> osteffen@redhat.com> wrote:
> > >> >>> >> >
> > >> >>
> > >> >> [...]
> > >> >>>
> > >> >>> >> > I am sorry, this story does not seem to be over yet.
> > >> >>> >> >
> > >> >>> >> > We are using the Erratum patch and also included the commit
> 406504c7 in
> > >> >>> >> > the kernel.
> > >> >>> >> > Now the firmware crashes sometimes (10 out of 89 tests).
> > >> >>> >> >
> > >> >>> >>
> > >> >>> >> Thanks for the report. Is this still on ThunderX2?
> > >> >>> >>
> > >> >>> >> > Any hints are very welcome!
> > >> >>> >> >
> > >> >>> >>
> > >> >>> >> Do you have access to those build artifacts?
> > >> >>> >
> > >> >>> >
> > >> >>> >
> https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
> > >> >>> >
> > >> >>> > and/or here:
> > >> >>> >
> > >> >>> > https://kojihub.stream.centos.org/koji/taskinfo?taskID=1835251
> > >> >>> >
> > >> >>> > Source for reference:
> > >> >>> >
> https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/24
> > >> >>> >
> > >> >>>
> > >> >>> Any chance the .dll files (which are actually ELF executables)
> have
> > >> >>> been preserved somewhere?
> > >> >>
> > >> >> Here is the build folder (~90MB):
> > >> >>
> https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/armvirt-thunderx2-issue.tar.xz
> > >> >>
> > >> >> I am waiting for the tests with the additional debug output to run.
> > >> >
> > >> >
> > >> > We reran the test suite with the Erratum and the additional debug
> > >> > output enabled. Strangely, the problem does not occur anymore, the
> > >> > firmware boots up normally.
> > >> >
> > >> > We retried the tests without the additional debug output.
> > >> > RHEL ships two firmware flavors for AARCH64: a silent and a verbose
> > >> > version.
> > >>
> > >> Are these RELEASE vs DEBUG builds?
> > >
> > >
> > > All builds are DEBUG, just the amount of information printed on
> > > the serial is different (almost zero for the "silent" one.)
> > >
> > >>
> > >> > Both were tried. We see no problems with the verbose
> > >> > one. The silent one fails noticeably more often if a software TPM
> device
> > >> > is present.
> > >> >
> > >>
> > >> This smells like some missing cache or TLB maintenance - the verbose
> > >> one exits to the host much more often, and likely relies on cache/TLB
> > >> maintenance occurring in the hypervisor.
> > >>
> > >> So the build always includes TPM support but the issue only occurs
> > >> when the sw TPM is actually exposed by QEMU?
> > >
> > >
> > > Yes.
> > > All builds include support for TPM, but the issue occurs more
> frequently
> > > if a sw TPM is exposed by QEMU.
> > >
> >
> > Any chance you could provide a specific command line for launching
> > QEMU? I am trying to reproduce this, but I am not making any progress.
> >
> > >>
> > >> > Could this be related to how much stuff is going on in the early
> phase
> > >> > of the firmware (when logging is enabled: formatting of messages and
> > >> > sending to serial port...) ?
> > >> >
> > >>
> > >> I'll try to see if I can rig something up that logs into a buffer
> > >> rather than straight to the serial, and dump it all out when handling
> > >> the crash
> > >>
> >
> > This takes a bit more time than I can afford to spend on this atm, and
> > I'd like to be able to reproduce before I go down this rabbit hole.
>
> Have there been any developments regarding this issue?
>
Nothing from my side. I tried to come up with a more reliable/faster
reproducer, but then stopped because of other stuff.
If you have any idea what I could try next, let me know.
-Oliver
[-- Attachment #2: Type: text/html, Size: 7956 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
[not found] ` <17489D498A098DB9.9697@groups.io>
@ 2023-05-19 16:32 ` Oliver Steffen
2023-05-19 21:36 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-05-19 16:32 UTC (permalink / raw)
To: Oliver Steffen, ardb, devel; +Cc: Gerd Hoffmann, Marc Zyngier, dann.frazier
Quoting Oliver Steffen (2023-03-02 14:29:43)
> On Thu, Mar 2, 2023 at 11:50 AM Ard Biesheuvel <[1]ardb@kernel.org> wrote:
>
> On Thu, 9 Feb 2023 at 16:15, Ard Biesheuvel <[2]ardb@kernel.org> wrote:
> >
> > On Tue, 7 Feb 2023 at 13:58, Oliver Steffen <[3]osteffen@redhat.com>
> wrote:
> > >
> > > On Tue, Feb 7, 2023 at 12:57 PM Ard Biesheuvel <[4]ardb@kernel.org>
> wrote:
> > >>
> > >> On Tue, 7 Feb 2023 at 11:51, Oliver Steffen <[5]osteffen@redhat.com>
> wrote:
> > >> >
> > >> > On Thu, Feb 2, 2023 at 12:09 PM Oliver Steffen <[6]
> osteffen@redhat.com> wrote:
> > >> >>
> > >> >>
> > >> >> On Wed, Feb 1, 2023 at 2:29 PM Ard Biesheuvel <[7]ardb@kernel.org>
> wrote:
> > >> >>>
> > >> >>> On Wed, 1 Feb 2023 at 13:59, Oliver Steffen <[8]
> osteffen@redhat.com> wrote:
> > >> >>> >
> > >> >>> > On Wed, Feb 1, 2023 at 12:52 PM Ard Biesheuvel <[9]
> ardb@kernel.org> wrote:
> > >> >>> >>
> > >> >>> >> On Wed, 1 Feb 2023 at 10:14, Oliver Steffen <[10]
> osteffen@redhat.com> wrote:
> > >> >>> >> >
> > >> >>
> > >> >> [...]
> > >> >>>
> > >> >>> >> > I am sorry, this story does not seem to be over yet.
> > >> >>> >> >
> > >> >>> >> > We are using the Erratum patch and also included the commit
> 406504c7 in
> > >> >>> >> > the kernel.
> > >> >>> >> > Now the firmware crashes sometimes (10 out of 89 tests).
> > >> >>> >> >
> > >> >>> >>
> > >> >>> >> Thanks for the report. Is this still on ThunderX2?
> > >> >>> >>
> > >> >>> >> > Any hints are very welcome!
> > >> >>> >> >
> > >> >>> >>
> > >> >>> >> Do you have access to those build artifacts?
> > >> >>> >
> > >> >>> >
> > >> >>> > [11]https://kojihub.stream.centos.org/kojifiles/work/tasks/5251/
> 1835251/edk2-aarch64-20221207gitfff6d81270b5-4.el9.test.noarch.rpm
> > >> >>> >
> > >> >>> > and/or here:
> > >> >>> >
> > >> >>> > [12]https://kojihub.stream.centos.org/koji/taskinfo?taskID=
> 1835251
> > >> >>> >
> > >> >>> > Source for reference:
> > >> >>> > [13]https://gitlab.com/redhat/centos-stream/src/edk2/-/
> merge_requests/24
> > >> >>> >
> > >> >>>
> > >> >>> Any chance the .dll files (which are actually ELF executables)
> have
> > >> >>> been preserved somewhere?
> > >> >>
> > >> >> Here is the build folder (~90MB):
> > >> >> [14]https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/
> armvirt-thunderx2-issue.tar.xz
> > >> >>
> > >> >> I am waiting for the tests with the additional debug output to run.
> > >> >
> > >> >
> > >> > We reran the test suite with the Erratum and the additional debug
> > >> > output enabled. Strangely, the problem does not occur anymore, the
> > >> > firmware boots up normally.
> > >> >
> > >> > We retried the tests without the additional debug output.
> > >> > RHEL ships two firmware flavors for AARCH64: a silent and a verbose
> > >> > version.
> > >>
> > >> Are these RELEASE vs DEBUG builds?
> > >
> > >
> > > All builds are DEBUG, just the amount of information printed on
> > > the serial is different (almost zero for the "silent" one.)
> > >
> > >>
> > >> > Both were tried. We see no problems with the verbose
> > >> > one. The silent one fails noticeably more often if a software TPM
> device
> > >> > is present.
> > >> >
> > >>
> > >> This smells like some missing cache or TLB maintenance - the verbose
> > >> one exits to the host much more often, and likely relies on cache/TLB
> > >> maintenance occurring in the hypervisor.
> > >>
> > >> So the build always includes TPM support but the issue only occurs
> > >> when the sw TPM is actually exposed by QEMU?
> > >
> > >
> > > Yes.
> > > All builds include support for TPM, but the issue occurs more
> frequently
> > > if a sw TPM is exposed by QEMU.
> > >
> >
> > Any chance you could provide a specific command line for launching
> > QEMU? I am trying to reproduce this, but I am not making any progress.
> >
> > >>
> > >> > Could this be related to how much stuff is going on in the early
> phase
> > >> > of the firmware (when logging is enabled: formatting of messages and
> > >> > sending to serial port...) ?
> > >> >
> > >>
> > >> I'll try to see if I can rig something up that logs into a buffer
> > >> rather than straight to the serial, and dump it all out when handling
> > >> the crash
> > >>
> >
> > This takes a bit more time than I can afford to spend on this atm, and
> > I'd like to be able to reproduce before I go down this rabbit hole.
>
> Have there been any developments regarding this issue?
>
>
> Nothing from my side. I tried to come up with a more reliable/faster
> reproducer
> but then stopped because of other stuff.
>
> If you have any idea what I could try next let me know.
>
> -Oliver
# Summary for Email 2
Hi all,
I had another look at this and I can now reproduce the issue consistently,
with a quite minimal setup, on a recent Linux kernel, Qemu, and EDK2.
It requires rebooting the guest in a tight loop. It happens in silent and
verbose builds alike, but since the verbose ones are slowed down by the
serial output, it takes longer to hit the issue.
It is possible to reproduce it with the silent builds within a few minutes.
For the verbose case I recommend running multiple Qemu instances in parallel
(as many as the machine allows, in my case ~100).
Details:
CPU: Cavium ThunderX2(R) CPU CN9975
Tested on 3 different machines:
HPE apache, HPE apollo, Gigabyte R181
Kernels tested:
- 6.2.15-100.fc36.aarch64
- 5.14.0-312.el9.aarch64
(contains 406504c7b0405d74d74c15a667cd4c4620c3e7a9,
"KVM: arm64: Fix S1PTW handling on RO memslots")
Qemu v8.0.0 (RHEL version and build from upstream repo)
EDK2: master branch from 2023-05-16 (cafb4f3f)
gcc 11.3.1
EDK2 build command line:
build \
-a AARCH64 \
-p ArmVirtPkg/ArmVirtQemu.dsc \
-t GCC5 -b DEBUG \
-D NETWORK_IP6_ENABLE \
-D NETWORK_HTTP_BOOT_ENABLE \
-D NETWORK_TLS_ENABLE \
-D NETWORK_ISCSI_ENABLE \
-D NETWORK_ALLOW_HTTP_CONNECTIONS \
-D CAVIUM_ERRATUM_27456=TRUE \
-D TPM2_ENABLE=TRUE \
-D TPM1_ENABLE=FALSE \
-D DEBUG_PRINT_ERROR_LEVEL=0x80000000 \
-D BUILD_SHELL=TRUE \
--pcd="gEfiShellPkgTokenSpaceGuid.PcdShellDefaultDelay=0" \
--pcd="gEfiMdePkgTokenSpaceGuid.PcdPlatformBootTimeOut=0" \
--hash --cmd-len=65536
To reproduce the issue I launched the firmware in Qemu and had it reboot
once it finished booting up, via a startup.nsh on the ESP.
Qemu command line:
qemu-system-aarch64 \
-machine virt,accel=kvm -m 13G \
-boot menu=off \
-cpu host \
-blockdev node-name=code,driver=file,filename="${FW_CODE}",read-only=on \
-blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
-machine pflash0=code \
-machine pflash1=vars \
-serial stdio \
-net none \
-drive file=esp.img,snapshot=on
Other things like number of CPUs or the presence of a vTPM have no
influence. I did not try different amounts of RAM yet.
Serial output:
[...]
InitializeDxeNxMemoryProtectionPolicy: StackBase = 0x00000000476C5000
StackSize = 0x0000000000020000
InitializeDxeNxMemoryProtectionPolicy: applying strict permissions to
active memory regions
SetUefiImageMemoryAttributes - 0x0000000040000000 - 0x00000000076E5000
(0x0000000000004000)
UpdateRegionMappingRecursive(0): 40000000 - 476E5000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(1): 40000000 - 476E5000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(2): 40000000 - 476E5000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 47600000 - 476E5000 set
60000000000400 clr FF9F000000000B3F
SetUefiImageMemoryAttributes - 0x00000000476C5000 - 0x0000000000001000
(0x0000000000006000)
UpdateRegionMappingRecursive(0): 476C5000 - 476C6000 set
60000000000000 clr FF9F000000000B3F
UpdateRegionMappingRecursive(1): 476C5000 - 476C6000 set
60000000000000 clr FF9F000000000B3F
UpdateRegionMappingRecursive(2): 476C5000 - 476C6000 set
60000000000000 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 476C5000 - 476C6000 set
60000000000000 clr FF9F000000000B3F
SetUefiImageMemoryAttributes - 0x000000004772B000 - 0x00000000007C0000
(0x0000000000004000)
UpdateRegionMappingRecursive(0): 4772B000 - 47EEB000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(1): 4772B000 - 47EEB000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(2): 4772B000 - 47EEB000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 4772B000 - 47800000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 47E00000 - 47EEB000 set
60000000000400 clr FF9F000000000B3F
SetUefiImageMemoryAttributes - 0x0000000047EF3000 - 0x0000000000101000
(0x0000000000004000)
UpdateRegionMappingRecursive(0): 47EF3000 - 47FF4000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(1): 47EF3000 - 47FF4000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(2): 47EF3000 - 47FF4000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 47EF3000 - 47FF4000 set
60000000000400 clr FF9F000000000B3F
SetUefiImageMemoryAttributes - 0x0000000047FFA000 - 0x0000000334AA6000
(0x0000000000004000)
UpdateRegionMappingRecursive(0): 47FFA000 - 37CAA0000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(1): 47FFA000 - 37CAA0000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(2): 47FFA000 - 80000000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 47FFA000 - 48000000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(2): 340000000 - 380000000 set 70C clr 0
UpdateRegionMappingRecursive(3): 37F000000 - 37F200000 set 70C clr 0
UpdateRegionMappingRecursive(2): 340000000 - 37CAA0000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 37CA00000 - 37CC00000 set 70C clr 0
UpdateRegionMappingRecursive(3): 37CA00000 - 37CAA0000 set
60000000000400 clr FF9F000000000B3F
SetUefiImageMemoryAttributes - 0x000000037CB40000 - 0x00000000031F9000
(0x0000000000004000)
UpdateRegionMappingRecursive(0): 37CB40000 - 37FD39000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(1): 37CB40000 - 37FD39000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(2): 37CB40000 - 37FD39000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 37CB40000 - 37CC00000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 37F000000 - 37F200000 set
60000000000400 clr FF9F000000000B3F
UpdateRegionMappingRecursive(3): 37FC00000 - 37FE00000 set 70C clr 0
UpdateRegionMappingRecursive(3): 37FC00000 - 37FD39000 set
60000000000400 clr FF9F000000000B3F
Synchronous Exception at 0x000000037FD3C0A8
PC 0x00037FD3C0A8 (0x00037FD39000+0x000030A8) [ 0] ArmCpuDxe.dll
PC 0x00037FD3C0A8 (0x00037FD39000+0x000030A8) [ 0] ArmCpuDxe.dll
PC 0x00037FD3BE70 (0x00037FD39000+0x00002E70) [ 0] ArmCpuDxe.dll
PC 0x00037FD3BE70 (0x00037FD39000+0x00002E70) [ 0] ArmCpuDxe.dll
PC 0x00037FD3C2E4 (0x00037FD39000+0x000032E4) [ 0] ArmCpuDxe.dll
PC 0x0000476E78F8 (0x0000476E5000+0x000028F8) [ 1] DxeCore.dll
PC 0x0000476ED680 (0x0000476E5000+0x00008680) [ 1] DxeCore.dll
PC 0x0000476F2744 (0x0000476E5000+0x0000D744) [ 1] DxeCore.dll
PC 0x0000476ECDE8 (0x0000476E5000+0x00007DE8) [ 1] DxeCore.dll
PC 0x00037FD3D2DC (0x00037FD39000+0x000042DC) [ 2] ArmCpuDxe.dll
PC 0x0000476EC788 (0x0000476E5000+0x00007788) [ 3] DxeCore.dll
PC 0x0000476F9CA8 (0x0000476E5000+0x00014CA8) [ 3] DxeCore.dll
PC 0x0000476EFEF0 (0x0000476E5000+0x0000AEF0) [ 3] DxeCore.dll
[ 0] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
[ 1] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
[ 2] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
[ 3] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
X0 0x000000037F10BFF0 X1 0x000000037F106003 X2
0x000000000037FC00 X3 0x0000000000000000
X4 0x0000000000000200 X5 0x0000000000000004 X6
0x0000000000000000 X7 0x000000037FD3F4B5
X8 0x0000000000000000 X9 0x0000000000000002 X10
0x0000000000000000 X11 0x0000000000000000
X12 0x0000000000000002 X13 0x0000000000000002 X14
0x0000000000000001 X15 0x0000000000000002
X16 0x000000037FD3A268 X17 0x00000000007AFA10 X18
0x0000000000000000 X19 0x000000037FC00000
X20 0x0000000000000002 X21 0x000000037F106003 X22
0x000000037F10B000 X23 0x000000037FD42000
X24 0x00000000001FFFFF X25 0x000000037FD39000 X26
0x000000037F106000 X27 0x0000000000000003
X28 0x000000037F10BFF0 FP 0x00000000476E4780 LR 0x000000037FD3C0A8
V0 0x0000000000000000 0000000000000000 V1 0x0000000000000000
0000000000000000
V2 0x0000000000000000 0000000000000000 V3 0x0000000000000000
0000000000000000
V4 0x0000000000000000 0000000000000000 V5 0x0000000000000000
0000000000000000
V6 0x0000000000000000 0000000000000000 V7 0x0000000000000000
0000000000000000
V8 0x0000000000000000 0000000000000000 V9 0x0000000000000000
0000000000000000
V10 0x0000000000000000 0000000000000000 V11 0x0000000000000000
0000000000000000
V12 0x0000000000000000 0000000000000000 V13 0x0000000000000000
0000000000000000
V14 0x0000000000000000 0000000000000000 V15 0x0000000000000000
0000000000000000
V16 0x0000000000000000 0000000000000000 V17 0x0000000000000000
0000000000000000
V18 0x0000000000000000 0000000000000000 V19 0x0000000000000000
0000000000000000
V20 0x0000000000000000 0000000000000000 V21 0x0000000000000000
0000000000000000
V22 0x0000000000000000 0000000000000000 V23 0x0000000000000000
0000000000000000
V24 0x0000000000000000 0000000000000000 V25 0x0000000000000000
0000000000000000
V26 0x0000000000000000 0000000000000000 V27 0x0000000000000000
0000000000000000
V28 0x0000000000000000 0000000000000000 V29 0x0000000000000000
0000000000000000
V30 0x0000000000000000 0000000000000000 V31 0x0000000000000000
0000000000000000
SP 0x00000000476E4780 ELR 0x000000037FD3C0A8 SPSR 0x80000205 FPSR
0x00000000
ESR 0x86000006 FAR 0x000000037FD3C0A8
ESR : EC 0x21 IL 0x1 ISS 0x00000006
Instruction abort: Translation fault, second level
Stack dump:
00000476E4680: 0000000000000001 0000000000000004 00000000476E4700
00000000476F3980
00000476E46A0: 000000037FD40CBD 0000000000000003 000000037FC00000
000000037FD39000
00000476E46C0: 0060000000000400 FF9F000000000B3F 00000000476E4780
000000037FD3BE70
00000476E46E0: 000000037FC00000 0000000000000002 000000037F106000
000000037F10B000
00000476E4700: 0000000000000FF0 00000000001FFFFF 000000037FD39000
000000037F106000
00000476E4720: 0000000000000003 000000037F10BFF0 0060000000000400
FF9F000000000B3F
00000476E4740: 000000037FD39000 000000037FD39000 00000000476E4780
0060000000000403
00000476E4760: 0000000C00000001 000000037FD3F90E 0000000000000400
000000037F10B000
> 00000476E4780: 00000000476E4830 000000037FD3BE70 000000037CB40000 0000000000000001
00000476E47A0: 000000037F10B000 0000000047FFE000 0000000000000068
000000003FFFFFFF
00000476E47C0: 000000037FD39000 000000037F10C528 0000000000000002
0000000047FFE068
00000476E47E0: 0060000000000400 FF9F000000000B3F 0000000300000001
000000037FD39000
00000476E4800: 000000017FD40CBD 0060000000000401 0000001500000001
000000037FD3F90E
00000476E4820: 0060000000000400 000000037F106000 00000000476E48E0
000000037FD3BE70
00000476E4840: 000000037CB40000 0000000000000000 0000000047FFE000
0000000047FFF000
00000476E4860: 0000000000000000 0000007FFFFFFFFF 000000037FD39000
000000037F10C528
ASSERT [ArmCpuDxe]
/root/edk2/ArmPkg/Library/DefaultExceptionHandlerLib/AArch64/DefaultExceptionHandler.c(333):
((BOOLEAN)(0==1))
The full log is available here:
https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/2023-05-19/85.log?inline=false
Debug files, firmware binaries, and the full build tree are here:
https://gitlab.com/osteffen/thunderx2-debug/-/tree/main/2023-05-19
I am able to reproduce this quickly, so any ideas for what I can try
are welcome :-)
Thanks
-Oliver
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-05-19 16:32 ` Oliver Steffen
@ 2023-05-19 21:36 ` Ard Biesheuvel
2023-05-20 8:37 ` Oliver Steffen
0 siblings, 1 reply; 37+ messages in thread
From: Ard Biesheuvel @ 2023-05-19 21:36 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
On Fri, 19 May 2023 at 18:32, Oliver Steffen <osteffen@redhat.com> wrote:
>
>
> Hi all,
>
> I had another look at this and I can now reproduce the issue consistently,
> with a quite minimal setup, on recent Linux kernel, Qemu, and EDK2.
> It requires rebooting the guest in a tight loop. It happens in silent
> and verbose
> builds alike, but since the verbose ones are slowed down by the serial
> output, it
> takes longer to hit the issue.
> It is possible to reproduce it with the silent builds within a few minutes.
> For the verbose case I recommend running multiple Qemu instances in parallel (as
> many as the machine allows, in my case ~100).
>
Thanks a lot for all these details, this is extremely helpful.
So what appears to be happening is that we split the 2M block mapping
that covers the code that we were called from, and hit a level 2
translation fault because the updated page table entry is still
observed to be in its transient 'invalid' state as we return to it.
Could you please check whether this makes a difference?
--- a/ArmPkg/Library/ArmMmuLib/AArch64/ArmMmuLibReplaceEntry.S
+++ b/ArmPkg/Library/ArmMmuLib/AArch64/ArmMmuLibReplaceEntry.S
@@ -65,6 +65,7 @@
// write updated entry
str x1, [x0]
dsb nshst
+ isb
.L2_\@:
.endm
> Details:
>
> CPU: Cavium ThunderX2(R) CPU CN9975
> Tested on 3 different machines:
> HPE apache, HPE apollo, Gigabyte R181
> Kernels tested:
> - 6.2.15-100.fc36.aarch64
> - 5.14.0-312.el9.aarch64
> (contains 406504c7b0405d74d74c15a667cd4c4620c3e7a9,
> "KVM: arm64: Fix S1PTW handling on RO memslots")
> Qemu v8.0.0 (RHEL version and build from upstream repo)
> EDK2: master branch from 2023-05-16 (cafb4f3f)
> gcc 11.3.1
>
> EDK2 build command line:
> build \
> -a AARCH64
> -p ArmVirtPkg/ArmVirtQemu.dsc
> -t GCC5 -b DEBUG \
> -D NETWORK_IP6_ENABLE \
> -D NETWORK_HTTP_BOOT_ENABLE \
> -D NETWORK_TLS_ENABLE \
> -D NETWORK_ISCSI_ENABLE \
> -D NETWORK_ALLOW_HTTP_CONNECTIONS \
> -D CAVIUM_ERRATUM_27456=TRUE \
> -D TPM2_ENABLE=TRUE \
> -D TPM1_ENABLE=FALSE \
> -D DEBUG_PRINT_ERROR_LEVEL=0x80000000 \
> -D BUILD_SHELL=TRUE \
> --pcd="gEfiShellPkgTokenSpaceGuid.PcdShellDefaultDelay=0" \
> --pcd="gEfiMdePkgTokenSpaceGuid.PcdPlatformBootTimeOut=0" \
> --hash --cmd-len=65536
>
> To reproduce the issue I launched the firmware in Qemu and have it do
> a reboot once it finished booting up
> via a startup.nsh on the ESP.
>
> Qemu command line:
> qemu-system-aarch64 \
> -machine virt,accel=kvm -m 13G \
> -boot menu=off \
> -cpu host \
> -blockdev node-name=code,driver=file,filename="${FW_CODE}",read-only=on \
> -blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
> -machine pflash0=code \
> -machine pflash1=vars \
> -serial stdio \
> -net none \
> -drive file=esp.img,snapshot=on
>
> Other things like number of CPUs or the presence of a vTPM have no
> influence. I did not try different amounts of RAM yet.
>
> Serial output:
> [...]
> InitializeDxeNxMemoryProtectionPolicy: StackBase = 0x00000000476C5000
> StackSize = 0x0000000000020000
> InitializeDxeNxMemoryProtectionPolicy: applying strict permissions to
> active memory regions
> SetUefiImageMemoryAttributes - 0x0000000040000000 - 0x00000000076E5000
> (0x0000000000004000)
> UpdateRegionMappingRecursive(0): 40000000 - 476E5000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(1): 40000000 - 476E5000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(2): 40000000 - 476E5000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 47600000 - 476E5000 set
> 60000000000400 clr FF9F000000000B3F
> SetUefiImageMemoryAttributes - 0x00000000476C5000 - 0x0000000000001000
> (0x0000000000006000)
> UpdateRegionMappingRecursive(0): 476C5000 - 476C6000 set
> 60000000000000 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(1): 476C5000 - 476C6000 set
> 60000000000000 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(2): 476C5000 - 476C6000 set
> 60000000000000 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 476C5000 - 476C6000 set
> 60000000000000 clr FF9F000000000B3F
> SetUefiImageMemoryAttributes - 0x000000004772B000 - 0x00000000007C0000
> (0x0000000000004000)
> UpdateRegionMappingRecursive(0): 4772B000 - 47EEB000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(1): 4772B000 - 47EEB000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(2): 4772B000 - 47EEB000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 4772B000 - 47800000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 47E00000 - 47EEB000 set
> 60000000000400 clr FF9F000000000B3F
> SetUefiImageMemoryAttributes - 0x0000000047EF3000 - 0x0000000000101000
> (0x0000000000004000)
> UpdateRegionMappingRecursive(0): 47EF3000 - 47FF4000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(1): 47EF3000 - 47FF4000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(2): 47EF3000 - 47FF4000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 47EF3000 - 47FF4000 set
> 60000000000400 clr FF9F000000000B3F
> SetUefiImageMemoryAttributes - 0x0000000047FFA000 - 0x0000000334AA6000
> (0x0000000000004000)
> UpdateRegionMappingRecursive(0): 47FFA000 - 37CAA0000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(1): 47FFA000 - 37CAA0000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(2): 47FFA000 - 80000000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 47FFA000 - 48000000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(2): 340000000 - 380000000 set 70C clr 0
> UpdateRegionMappingRecursive(3): 37F000000 - 37F200000 set 70C clr 0
> UpdateRegionMappingRecursive(2): 340000000 - 37CAA0000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 37CA00000 - 37CC00000 set 70C clr 0
> UpdateRegionMappingRecursive(3): 37CA00000 - 37CAA0000 set
> 60000000000400 clr FF9F000000000B3F
> SetUefiImageMemoryAttributes - 0x000000037CB40000 - 0x00000000031F9000
> (0x0000000000004000)
> UpdateRegionMappingRecursive(0): 37CB40000 - 37FD39000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(1): 37CB40000 - 37FD39000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(2): 37CB40000 - 37FD39000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 37CB40000 - 37CC00000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 37F000000 - 37F200000 set
> 60000000000400 clr FF9F000000000B3F
> UpdateRegionMappingRecursive(3): 37FC00000 - 37FE00000 set 70C clr 0
> UpdateRegionMappingRecursive(3): 37FC00000 - 37FD39000 set
> 60000000000400 clr FF9F000000000B3F
>
>
> Synchronous Exception at 0x000000037FD3C0A8
> PC 0x00037FD3C0A8 (0x00037FD39000+0x000030A8) [ 0] ArmCpuDxe.dll
> PC 0x00037FD3C0A8 (0x00037FD39000+0x000030A8) [ 0] ArmCpuDxe.dll
> PC 0x00037FD3BE70 (0x00037FD39000+0x00002E70) [ 0] ArmCpuDxe.dll
> PC 0x00037FD3BE70 (0x00037FD39000+0x00002E70) [ 0] ArmCpuDxe.dll
> PC 0x00037FD3C2E4 (0x00037FD39000+0x000032E4) [ 0] ArmCpuDxe.dll
> PC 0x0000476E78F8 (0x0000476E5000+0x000028F8) [ 1] DxeCore.dll
> PC 0x0000476ED680 (0x0000476E5000+0x00008680) [ 1] DxeCore.dll
> PC 0x0000476F2744 (0x0000476E5000+0x0000D744) [ 1] DxeCore.dll
> PC 0x0000476ECDE8 (0x0000476E5000+0x00007DE8) [ 1] DxeCore.dll
> PC 0x00037FD3D2DC (0x00037FD39000+0x000042DC) [ 2] ArmCpuDxe.dll
> PC 0x0000476EC788 (0x0000476E5000+0x00007788) [ 3] DxeCore.dll
> PC 0x0000476F9CA8 (0x0000476E5000+0x00014CA8) [ 3] DxeCore.dll
> PC 0x0000476EFEF0 (0x0000476E5000+0x0000AEF0) [ 3] DxeCore.dll
>
> [ 0] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> [ 1] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
> [ 2] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> [ 3] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
>
> X0 0x000000037F10BFF0 X1 0x000000037F106003 X2
> 0x000000000037FC00 X3 0x0000000000000000
> X4 0x0000000000000200 X5 0x0000000000000004 X6
> 0x0000000000000000 X7 0x000000037FD3F4B5
> X8 0x0000000000000000 X9 0x0000000000000002 X10
> 0x0000000000000000 X11 0x0000000000000000
> X12 0x0000000000000002 X13 0x0000000000000002 X14
> 0x0000000000000001 X15 0x0000000000000002
> X16 0x000000037FD3A268 X17 0x00000000007AFA10 X18
> 0x0000000000000000 X19 0x000000037FC00000
> X20 0x0000000000000002 X21 0x000000037F106003 X22
> 0x000000037F10B000 X23 0x000000037FD42000
> X24 0x00000000001FFFFF X25 0x000000037FD39000 X26
> 0x000000037F106000 X27 0x0000000000000003
> X28 0x000000037F10BFF0 FP 0x00000000476E4780 LR 0x000000037FD3C0A8
>
> V0 0x0000000000000000 0000000000000000 V1 0x0000000000000000
> 0000000000000000
> V2 0x0000000000000000 0000000000000000 V3 0x0000000000000000
> 0000000000000000
> V4 0x0000000000000000 0000000000000000 V5 0x0000000000000000
> 0000000000000000
> V6 0x0000000000000000 0000000000000000 V7 0x0000000000000000
> 0000000000000000
> V8 0x0000000000000000 0000000000000000 V9 0x0000000000000000
> 0000000000000000
> V10 0x0000000000000000 0000000000000000 V11 0x0000000000000000
> 0000000000000000
> V12 0x0000000000000000 0000000000000000 V13 0x0000000000000000
> 0000000000000000
> V14 0x0000000000000000 0000000000000000 V15 0x0000000000000000
> 0000000000000000
> V16 0x0000000000000000 0000000000000000 V17 0x0000000000000000
> 0000000000000000
> V18 0x0000000000000000 0000000000000000 V19 0x0000000000000000
> 0000000000000000
> V20 0x0000000000000000 0000000000000000 V21 0x0000000000000000
> 0000000000000000
> V22 0x0000000000000000 0000000000000000 V23 0x0000000000000000
> 0000000000000000
> V24 0x0000000000000000 0000000000000000 V25 0x0000000000000000
> 0000000000000000
> V26 0x0000000000000000 0000000000000000 V27 0x0000000000000000
> 0000000000000000
> V28 0x0000000000000000 0000000000000000 V29 0x0000000000000000
> 0000000000000000
> V30 0x0000000000000000 0000000000000000 V31 0x0000000000000000
> 0000000000000000
>
> SP 0x00000000476E4780 ELR 0x000000037FD3C0A8 SPSR 0x80000205 FPSR
> 0x00000000
> ESR 0x86000006 FAR 0x000000037FD3C0A8
>
> ESR : EC 0x21 IL 0x1 ISS 0x00000006
>
> Instruction abort: Translation fault, second level
>
> Stack dump:
> 00000476E4680: 0000000000000001 0000000000000004 00000000476E4700
> 00000000476F3980
> 00000476E46A0: 000000037FD40CBD 0000000000000003 000000037FC00000
> 000000037FD39000
> 00000476E46C0: 0060000000000400 FF9F000000000B3F 00000000476E4780
> 000000037FD3BE70
> 00000476E46E0: 000000037FC00000 0000000000000002 000000037F106000
> 000000037F10B000
> 00000476E4700: 0000000000000FF0 00000000001FFFFF 000000037FD39000
> 000000037F106000
> 00000476E4720: 0000000000000003 000000037F10BFF0 0060000000000400
> FF9F000000000B3F
> 00000476E4740: 000000037FD39000 000000037FD39000 00000000476E4780
> 0060000000000403
> 00000476E4760: 0000000C00000001 000000037FD3F90E 0000000000000400
> 000000037F10B000
> > 00000476E4780: 00000000476E4830 000000037FD3BE70 000000037CB40000 0000000000000001
> 00000476E47A0: 000000037F10B000 0000000047FFE000 0000000000000068
> 000000003FFFFFFF
> 00000476E47C0: 000000037FD39000 000000037F10C528 0000000000000002
> 0000000047FFE068
> 00000476E47E0: 0060000000000400 FF9F000000000B3F 0000000300000001
> 000000037FD39000
> 00000476E4800: 000000017FD40CBD 0060000000000401 0000001500000001
> 000000037FD3F90E
> 00000476E4820: 0060000000000400 000000037F106000 00000000476E48E0
> 000000037FD3BE70
> 00000476E4840: 000000037CB40000 0000000000000000 0000000047FFE000
> 0000000047FFF000
> 00000476E4860: 0000000000000000 0000007FFFFFFFFF 000000037FD39000
> 000000037F10C528
> ASSERT [ArmCpuDxe]
> /root/edk2/ArmPkg/Library/DefaultExceptionHandlerLib/AArch64/DefaultExceptionHandler.c(333):
> ((BOOLEAN)(0==1))
>
>
>
> The full log is available here:
> https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/2023-05-19/85.log?inline=false
>
> Debug files, firmware binaries, and the full build tree are here:
> https://gitlab.com/osteffen/thunderx2-debug/-/tree/main/2023-05-19
>
> I am able to reproduce this quickly, so any ideas for what I can try
> are welcome :-)
>
> Thanks
> -Oliver
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-05-19 21:36 ` Ard Biesheuvel
@ 2023-05-20 8:37 ` Oliver Steffen
2023-05-20 9:20 ` Ard Biesheuvel
0 siblings, 1 reply; 37+ messages in thread
From: Oliver Steffen @ 2023-05-20 8:37 UTC (permalink / raw)
To: Ard Biesheuvel; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
Quoting Ard Biesheuvel (2023-05-19 23:36:53)
> On Fri, 19 May 2023 at 18:32, Oliver Steffen <osteffen@redhat.com> wrote:
> >
> >
> > Hi all,
> >
> > I had another look at this and I can now reproduce the issue consistently,
> > with a quite minimal setup, on recent Linux kernel, Qemu, and EDK2.
> > It requires rebooting the guest in a tight loop. It happens in silent
> > and verbose builds alike, but since the verbose ones are slowed down by
> > the serial output, it takes longer to hit the issue.
> > It is possible to reproduce it with the silent builds within a few minutes.
> > For the verbose case I recommend running multiple Qemu instances in parallel (as
> > many as the machine allows, in my case ~100).
> >
>
> Thanks a lot for all these details, this is extremely helpful.
>
> So what appears to be happening is that we split the 2M block mapping
> that covers the code that we were called from, and hit a level 2
> translation fault because the updated page table entry is still
> observed to be in its transient 'invalid' state as we return to it.
>
> Could you please check whether this makes a difference?
>
> --- a/ArmPkg/Library/ArmMmuLib/AArch64/ArmMmuLibReplaceEntry.S
> +++ b/ArmPkg/Library/ArmMmuLib/AArch64/ArmMmuLibReplaceEntry.S
> @@ -65,6 +65,7 @@
> // write updated entry
> str x1, [x0]
> dsb nshst
> + isb
>
> .L2_\@:
> .endm
That fixes it - no crash observed within 150k iterations.
Thanks, Ard!
- Oliver
>
>
> > Details:
> >
> > CPU: Cavium ThunderX2(R) CPU CN9975
> > Tested on 3 different machines:
> > HPE apache, HPE apollo, Gigabyte R181
> > Kernels tested:
> > - 6.2.15-100.fc36.aarch64
> > - 5.14.0-312.el9.aarch64
> > (contains 406504c7b0405d74d74c15a667cd4c4620c3e7a9,
> > "KVM: arm64: Fix S1PTW handling on RO memslots")
> > Qemu v8.0.0 (RHEL version and build from upstream repo)
> > EDK2: master branch from 2023-05-16 (cafb4f3f)
> > gcc 11.3.1
> >
> > EDK2 build command line:
> > build \
> > -a AARCH64
> > -p ArmVirtPkg/ArmVirtQemu.dsc
> > -t GCC5 -b DEBUG \
> > -D NETWORK_IP6_ENABLE \
> > -D NETWORK_HTTP_BOOT_ENABLE \
> > -D NETWORK_TLS_ENABLE \
> > -D NETWORK_ISCSI_ENABLE \
> > -D NETWORK_ALLOW_HTTP_CONNECTIONS \
> > -D CAVIUM_ERRATUM_27456=TRUE \
> > -D TPM2_ENABLE=TRUE \
> > -D TPM1_ENABLE=FALSE \
> > -D DEBUG_PRINT_ERROR_LEVEL=0x80000000 \
> > -D BUILD_SHELL=TRUE \
> > --pcd="gEfiShellPkgTokenSpaceGuid.PcdShellDefaultDelay=0" \
> > --pcd="gEfiMdePkgTokenSpaceGuid.PcdPlatformBootTimeOut=0" \
> > --hash --cmd-len=65536
> >
> > To reproduce the issue I launched the firmware in Qemu and have it do
> > a reboot once it finished booting up
> > via a startup.nsh on the ESP.
> >
> > Qemu command line:
> > qemu-system-aarch64 \
> > -machine virt,accel=kvm -m 13G \
> > -boot menu=off \
> > -cpu host \
> > -blockdev node-name=code,driver=file,filename="${FW_CODE}",read-only=on \
> > -blockdev node-name=vars,driver=file,filename="${FW_VARS}" \
> > -machine pflash0=code \
> > -machine pflash1=vars \
> > -serial stdio \
> > -net none \
> > -drive file=esp.img,snapshot=on
> >
> > Other things like number of CPUs or the presence of a vTPM have no
> > influence. I did not try different amounts of RAM yet.
> >
> > Serial output:
> > [...]
> > InitializeDxeNxMemoryProtectionPolicy: StackBase = 0x00000000476C5000
> > StackSize = 0x0000000000020000
> > InitializeDxeNxMemoryProtectionPolicy: applying strict permissions to
> > active memory regions
> > SetUefiImageMemoryAttributes - 0x0000000040000000 - 0x00000000076E5000
> > (0x0000000000004000)
> > UpdateRegionMappingRecursive(0): 40000000 - 476E5000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(1): 40000000 - 476E5000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(2): 40000000 - 476E5000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 47600000 - 476E5000 set
> > 60000000000400 clr FF9F000000000B3F
> > SetUefiImageMemoryAttributes - 0x00000000476C5000 - 0x0000000000001000
> > (0x0000000000006000)
> > UpdateRegionMappingRecursive(0): 476C5000 - 476C6000 set
> > 60000000000000 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(1): 476C5000 - 476C6000 set
> > 60000000000000 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(2): 476C5000 - 476C6000 set
> > 60000000000000 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 476C5000 - 476C6000 set
> > 60000000000000 clr FF9F000000000B3F
> > SetUefiImageMemoryAttributes - 0x000000004772B000 - 0x00000000007C0000
> > (0x0000000000004000)
> > UpdateRegionMappingRecursive(0): 4772B000 - 47EEB000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(1): 4772B000 - 47EEB000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(2): 4772B000 - 47EEB000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 4772B000 - 47800000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 47E00000 - 47EEB000 set
> > 60000000000400 clr FF9F000000000B3F
> > SetUefiImageMemoryAttributes - 0x0000000047EF3000 - 0x0000000000101000
> > (0x0000000000004000)
> > UpdateRegionMappingRecursive(0): 47EF3000 - 47FF4000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(1): 47EF3000 - 47FF4000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(2): 47EF3000 - 47FF4000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 47EF3000 - 47FF4000 set
> > 60000000000400 clr FF9F000000000B3F
> > SetUefiImageMemoryAttributes - 0x0000000047FFA000 - 0x0000000334AA6000
> > (0x0000000000004000)
> > UpdateRegionMappingRecursive(0): 47FFA000 - 37CAA0000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(1): 47FFA000 - 37CAA0000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(2): 47FFA000 - 80000000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 47FFA000 - 48000000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(2): 340000000 - 380000000 set 70C clr 0
> > UpdateRegionMappingRecursive(3): 37F000000 - 37F200000 set 70C clr 0
> > UpdateRegionMappingRecursive(2): 340000000 - 37CAA0000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 37CA00000 - 37CC00000 set 70C clr 0
> > UpdateRegionMappingRecursive(3): 37CA00000 - 37CAA0000 set
> > 60000000000400 clr FF9F000000000B3F
> > SetUefiImageMemoryAttributes - 0x000000037CB40000 - 0x00000000031F9000
> > (0x0000000000004000)
> > UpdateRegionMappingRecursive(0): 37CB40000 - 37FD39000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(1): 37CB40000 - 37FD39000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(2): 37CB40000 - 37FD39000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 37CB40000 - 37CC00000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 37F000000 - 37F200000 set
> > 60000000000400 clr FF9F000000000B3F
> > UpdateRegionMappingRecursive(3): 37FC00000 - 37FE00000 set 70C clr 0
> > UpdateRegionMappingRecursive(3): 37FC00000 - 37FD39000 set
> > 60000000000400 clr FF9F000000000B3F
> >
> >
> > Synchronous Exception at 0x000000037FD3C0A8
> > PC 0x00037FD3C0A8 (0x00037FD39000+0x000030A8) [ 0] ArmCpuDxe.dll
> > PC 0x00037FD3C0A8 (0x00037FD39000+0x000030A8) [ 0] ArmCpuDxe.dll
> > PC 0x00037FD3BE70 (0x00037FD39000+0x00002E70) [ 0] ArmCpuDxe.dll
> > PC 0x00037FD3BE70 (0x00037FD39000+0x00002E70) [ 0] ArmCpuDxe.dll
> > PC 0x00037FD3C2E4 (0x00037FD39000+0x000032E4) [ 0] ArmCpuDxe.dll
> > PC 0x0000476E78F8 (0x0000476E5000+0x000028F8) [ 1] DxeCore.dll
> > PC 0x0000476ED680 (0x0000476E5000+0x00008680) [ 1] DxeCore.dll
> > PC 0x0000476F2744 (0x0000476E5000+0x0000D744) [ 1] DxeCore.dll
> > PC 0x0000476ECDE8 (0x0000476E5000+0x00007DE8) [ 1] DxeCore.dll
> > PC 0x00037FD3D2DC (0x00037FD39000+0x000042DC) [ 2] ArmCpuDxe.dll
> > PC 0x0000476EC788 (0x0000476E5000+0x00007788) [ 3] DxeCore.dll
> > PC 0x0000476F9CA8 (0x0000476E5000+0x00014CA8) [ 3] DxeCore.dll
> > PC 0x0000476EFEF0 (0x0000476E5000+0x0000AEF0) [ 3] DxeCore.dll
> >
> > [ 0] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> > [ 1] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
> > [ 2] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/ArmPkg/Drivers/CpuDxe/CpuDxe/DEBUG/ArmCpuDxe.dll
> > [ 3] /root/edk2/Build/ArmVirtQemu-AARCH64/DEBUG_GCC5/AARCH64/MdeModulePkg/Core/Dxe/DxeMain/DEBUG/DxeCore.dll
> >
> > X0 0x000000037F10BFF0 X1 0x000000037F106003 X2
> > 0x000000000037FC00 X3 0x0000000000000000
> > X4 0x0000000000000200 X5 0x0000000000000004 X6
> > 0x0000000000000000 X7 0x000000037FD3F4B5
> > X8 0x0000000000000000 X9 0x0000000000000002 X10
> > 0x0000000000000000 X11 0x0000000000000000
> > X12 0x0000000000000002 X13 0x0000000000000002 X14
> > 0x0000000000000001 X15 0x0000000000000002
> > X16 0x000000037FD3A268 X17 0x00000000007AFA10 X18
> > 0x0000000000000000 X19 0x000000037FC00000
> > X20 0x0000000000000002 X21 0x000000037F106003 X22
> > 0x000000037F10B000 X23 0x000000037FD42000
> > X24 0x00000000001FFFFF X25 0x000000037FD39000 X26
> > 0x000000037F106000 X27 0x0000000000000003
> > X28 0x000000037F10BFF0 FP 0x00000000476E4780 LR 0x000000037FD3C0A8
> >
> > V0 0x0000000000000000 0000000000000000 V1 0x0000000000000000
> > 0000000000000000
> > V2 0x0000000000000000 0000000000000000 V3 0x0000000000000000
> > 0000000000000000
> > V4 0x0000000000000000 0000000000000000 V5 0x0000000000000000
> > 0000000000000000
> > V6 0x0000000000000000 0000000000000000 V7 0x0000000000000000
> > 0000000000000000
> > V8 0x0000000000000000 0000000000000000 V9 0x0000000000000000
> > 0000000000000000
> > V10 0x0000000000000000 0000000000000000 V11 0x0000000000000000
> > 0000000000000000
> > V12 0x0000000000000000 0000000000000000 V13 0x0000000000000000
> > 0000000000000000
> > V14 0x0000000000000000 0000000000000000 V15 0x0000000000000000
> > 0000000000000000
> > V16 0x0000000000000000 0000000000000000 V17 0x0000000000000000
> > 0000000000000000
> > V18 0x0000000000000000 0000000000000000 V19 0x0000000000000000
> > 0000000000000000
> > V20 0x0000000000000000 0000000000000000 V21 0x0000000000000000
> > 0000000000000000
> > V22 0x0000000000000000 0000000000000000 V23 0x0000000000000000
> > 0000000000000000
> > V24 0x0000000000000000 0000000000000000 V25 0x0000000000000000
> > 0000000000000000
> > V26 0x0000000000000000 0000000000000000 V27 0x0000000000000000
> > 0000000000000000
> > V28 0x0000000000000000 0000000000000000 V29 0x0000000000000000
> > 0000000000000000
> > V30 0x0000000000000000 0000000000000000 V31 0x0000000000000000
> > 0000000000000000
> >
> > SP 0x00000000476E4780 ELR 0x000000037FD3C0A8 SPSR 0x80000205 FPSR
> > 0x00000000
> > ESR 0x86000006 FAR 0x000000037FD3C0A8
> >
> > ESR : EC 0x21 IL 0x1 ISS 0x00000006
> >
> > Instruction abort: Translation fault, second level
> >
> > Stack dump:
> > 00000476E4680: 0000000000000001 0000000000000004 00000000476E4700
> > 00000000476F3980
> > 00000476E46A0: 000000037FD40CBD 0000000000000003 000000037FC00000
> > 000000037FD39000
> > 00000476E46C0: 0060000000000400 FF9F000000000B3F 00000000476E4780
> > 000000037FD3BE70
> > 00000476E46E0: 000000037FC00000 0000000000000002 000000037F106000
> > 000000037F10B000
> > 00000476E4700: 0000000000000FF0 00000000001FFFFF 000000037FD39000
> > 000000037F106000
> > 00000476E4720: 0000000000000003 000000037F10BFF0 0060000000000400
> > FF9F000000000B3F
> > 00000476E4740: 000000037FD39000 000000037FD39000 00000000476E4780
> > 0060000000000403
> > 00000476E4760: 0000000C00000001 000000037FD3F90E 0000000000000400
> > 000000037F10B000
> > > 00000476E4780: 00000000476E4830 000000037FD3BE70 000000037CB40000 0000000000000001
> > 00000476E47A0: 000000037F10B000 0000000047FFE000 0000000000000068
> > 000000003FFFFFFF
> > 00000476E47C0: 000000037FD39000 000000037F10C528 0000000000000002
> > 0000000047FFE068
> > 00000476E47E0: 0060000000000400 FF9F000000000B3F 0000000300000001
> > 000000037FD39000
> > 00000476E4800: 000000017FD40CBD 0060000000000401 0000001500000001
> > 000000037FD3F90E
> > 00000476E4820: 0060000000000400 000000037F106000 00000000476E48E0
> > 000000037FD3BE70
> > 00000476E4840: 000000037CB40000 0000000000000000 0000000047FFE000
> > 0000000047FFF000
> > 00000476E4860: 0000000000000000 0000007FFFFFFFFF 000000037FD39000
> > 000000037F10C528
> > ASSERT [ArmCpuDxe]
> > /root/edk2/ArmPkg/Library/DefaultExceptionHandlerLib/AArch64/DefaultExceptionHandler.c(333):
> > ((BOOLEAN)(0==1))
> >
> >
> >
> > The full log is available here:
> > https://gitlab.com/osteffen/thunderx2-debug/-/raw/main/2023-05-19/85.log?inline=false
> >
> > Debug files, firmware binaries, and the full build tree are here:
> > https://gitlab.com/osteffen/thunderx2-debug/-/tree/main/2023-05-19
> >
> > I am able to reproduce this quickly, so any ideas for what I can try
> > are welcome :-)
> >
> > Thanks
> > -Oliver
> >
>
--
🎩Oliver Steffen (he/him) - Software Engineer, Virtualization
Red Hat GmbH <https://www.redhat.com/de/global/dach>,
Registered seat: Werner-von-Siemens-Ring 12, D-85630 Grasbrunn, Germany
Commercial register: Amtsgericht München/Munich, HRB 153243,
Managing Directors: Ryan Barnhart, Charles Cachera, Michael O'Neill,
Amy Ross
Everyone has different working hours… Please do not feel obligated to
reply outside of your normal work schedule.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [edk2-devel] [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX
2023-05-20 8:37 ` Oliver Steffen
@ 2023-05-20 9:20 ` Ard Biesheuvel
0 siblings, 0 replies; 37+ messages in thread
From: Ard Biesheuvel @ 2023-05-20 9:20 UTC (permalink / raw)
To: Oliver Steffen; +Cc: devel, Gerd Hoffmann, Marc Zyngier, dann.frazier
On Sat, 20 May 2023 at 10:37, Oliver Steffen <osteffen@redhat.com> wrote:
>
> Quoting Ard Biesheuvel (2023-05-19 23:36:53)
> > On Fri, 19 May 2023 at 18:32, Oliver Steffen <osteffen@redhat.com> wrote:
> > >
> > >
> > > Hi all,
> > >
> > > I had another look at this and I can now reproduce the issue consistently,
> > > with a quite minimal setup, on recent Linux kernel, Qemu, and EDK2.
> > > It requires rebooting the guest in a tight loop. It happens in silent
> > > and verbose builds alike, but since the verbose ones are slowed down by
> > > the serial output, it takes longer to hit the issue.
> > > It is possible to reproduce it with the silent builds within a few minutes.
> > > For the verbose case I recommend running multiple Qemu instances in parallel (as
> > > many as the machine allows, in my case ~100).
> > >
> >
> > Thanks a lot for all these details, this is extremely helpful.
> >
> > So what appears to be happening is that we split the 2M block mapping
> > that covers the code that we were called from, and hit a level 2
> > translation fault because the updated page table entry is still
> > observed to be in its transient 'invalid' state as we return to it.
> >
> > Could you please check whether this makes a difference?
> >
> > --- a/ArmPkg/Library/ArmMmuLib/AArch64/ArmMmuLibReplaceEntry.S
> > +++ b/ArmPkg/Library/ArmMmuLib/AArch64/ArmMmuLibReplaceEntry.S
> > @@ -65,6 +65,7 @@
> > // write updated entry
> > str x1, [x0]
> > dsb nshst
> > + isb
> >
> > .L2_\@:
> > .endm
>
> That fixes it - no crash observed within 150k iterations.
> Thanks, Ard!
>
Fantastic! Thanks a lot for all the effort in tracking this down.
^ permalink raw reply [flat|nested] 37+ messages in thread
end of thread, other threads:[~2023-05-20 9:20 UTC | newest]
Thread overview: 37+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2023-01-05 16:25 [PATCH v2 1/2] ArmVirtPkg/ArmPlatformLibQemu: Ensure that VFP is on before running C code Ard Biesheuvel
2023-01-05 16:25 ` [PATCH v2 2/2] ArmVirtPkg/ArmVirtQemu: Avoid early ID map on ThunderX Ard Biesheuvel
2023-01-10 0:08 ` dann frazier
2023-01-17 12:47 ` [edk2-devel] " Oliver Steffen
2023-01-17 14:53 ` Ard Biesheuvel
2023-01-18 6:36 ` Oliver Steffen
2023-01-18 7:34 ` Ard Biesheuvel
2023-01-18 8:27 ` Oliver Steffen
2023-01-18 8:48 ` Ard Biesheuvel
2023-01-18 9:22 ` Ard Biesheuvel
2023-01-19 11:03 ` Oliver Steffen
2023-01-19 11:11 ` Ard Biesheuvel
2023-01-19 11:25 ` Oliver Steffen
2023-01-19 11:55 ` Marc Zyngier
2023-01-19 12:21 ` Ard Biesheuvel
2023-01-19 12:00 ` Gerd Hoffmann
2023-01-19 12:55 ` Oliver Steffen
2023-01-19 13:21 ` Ard Biesheuvel
2023-01-26 12:01 ` Gerd Hoffmann
2023-01-26 13:48 ` Marc Zyngier
2023-01-26 14:46 ` Gerd Hoffmann
2023-01-26 15:08 ` Marc Zyngier
2023-02-01 9:13 ` Oliver Steffen
2023-02-01 11:51 ` Ard Biesheuvel
2023-02-01 12:58 ` Oliver Steffen
2023-02-01 13:29 ` Ard Biesheuvel
2023-02-02 11:09 ` Oliver Steffen
[not found] ` <173FFD60429C89C3.3213@groups.io>
2023-02-07 10:51 ` Oliver Steffen
2023-02-07 11:56 ` Ard Biesheuvel
2023-02-07 12:58 ` Oliver Steffen
2023-02-09 15:15 ` Ard Biesheuvel
2023-03-02 10:50 ` Ard Biesheuvel
2023-03-02 13:29 ` Oliver Steffen
[not found] ` <17489D498A098DB9.9697@groups.io>
2023-05-19 16:32 ` Oliver Steffen
2023-05-19 21:36 ` Ard Biesheuvel
2023-05-20 8:37 ` Oliver Steffen
2023-05-20 9:20 ` Ard Biesheuvel
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox