#include "arm_asm.h"
// Copyright 2022-2026 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD and AESE on AARCH64
//
// Dec 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.arch	armv8-a+crypto
.text

// Read-only constant pool used by the SM4 routines in this file.
// The pool is aligned to 128 bytes (.align 7).
.type	_vpsm4_ex_consts,%object
.align	7
_vpsm4_ex_consts:
// SM4 key-schedule constants CK[0..31].  _vpsm4_ex_set_key reads one
// 32-bit word per iteration with a post-incremented load.
.Lck:
.long	0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long	0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long	0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long	0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long	0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long	0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long	0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long	0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
// SM4 system parameter FK[0..3]; XORed into the user key before the
// key-schedule loop starts.
.Lfk:
.quad	0x56aa3350a3b1bac6,0xb27022dc677d9197
// TBL index vector that moves byte j+4 (mod 16) to byte j, i.e. rotates
// the four 32-bit words of the key state down by one position.
.Lshuffles:
.quad	0x0B0A090807060504,0x030201000F0E0D0C
// NOTE(review): not referenced in the visible part of this file;
// presumably consumed by the XTS routines - confirm against the user.
.Lxts_magic:
#ifndef __AARCH64EB__
.quad	0x0101010101010187,0x0101010101010101
#else
.quad	0x0101010101010101,0x0101010101010187
#endif
// Six 16-byte vectors loaded into q26..q31 (offsets 0,16,...,80) by every
// routine that evaluates the S-box:
//   q26  byte permutation applied before AESE; the index pattern is the
//        inverse of the AES ShiftRows step, so the ShiftRows performed
//        by AESE puts each byte back in its original position
//   q27  high-nibble lookup table used before AESE
//   q28  low-nibble lookup table used before AESE
//   q29  high-nibble lookup table used after AESE
//   q30  low-nibble lookup table used after AESE
//   q31  0x0f byte mask selecting the low nibble (last row below)
// NOTE(review): the nibble tables appear to be the affine maps that let
// the AES S-box inside AESE evaluate the SM4 S-box - confirm against
// the generator script.
.Lsbox_magic:
#ifndef __AARCH64EB__
.quad	0x0b0e0104070a0d00,0x0306090c0f020508
.quad	0x62185a2042387a00,0x22581a6002783a40
.quad	0x15df62a89e54e923,0xc10bb67c4a803df7
.quad	0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad	0x6404462679195b3b,0xe383c1a1fe9edcbc
#else
// Big-endian build: the two 64-bit halves of every vector are swapped.
.quad	0x0306090c0f020508,0x0b0e0104070a0d00
.quad	0x22581a6002783a40,0x62185a2042387a00
.quad	0xc10bb67c4a803df7,0x15df62a89e54e923
.quad	0x1407c6d56c7fbead,0xb9aa6b78c1d21300
.quad	0xe383c1a1fe9edcbc,0x6404462679195b3b
#endif
.quad	0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size	_vpsm4_ex_consts,.-_vpsm4_ex_consts
// _vpsm4_ex_set_key: expand a 128-bit SM4 user key into 32 round keys.
//
// In:   x0 = user key (16 bytes)
//       x1 = round-key buffer (32 words of 32 bits)
//       w2 = direction flag
//              non-zero: keys are stored ascending from x1
//              zero:     keys are stored descending, starting at x1+124
// Out:  round-key buffer filled
// Clobbers: x1, x5-x9, v0-v2, v4-v7, v26-v31, NZCV
//
// Each iteration computes
//     rk = K0 ^ Lk(Sbox(K1 ^ K2 ^ K3 ^ CK[i]))
// with Lk(x) = x ^ rol(x,13) ^ rol(x,23), stores rk, and rotates the
// key state (v5) down by one word so the next iteration sees K1..K3,rk.
.type	_vpsm4_ex_set_key,%function
.align	4
_vpsm4_ex_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{v5.4s},[x0]
	// S-box helper vectors, see .Lsbox_magic
	adrp	x9, .Lsbox_magic
	add	x9, x9, #:lo12:.Lsbox_magic
	ldr	q26, [x9]
	ldr	q27, [x9, 16]
	ldr	q28, [x9, 32]
	ldr	q29, [x9, 48]
	ldr	q30, [x9, 64]
	ldr	q31, [x9, 80]
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b		// key words are big-endian in memory
#endif
	adrp	x5,.Lshuffles
	add	x5,x5,#:lo12:.Lshuffles
	ld1	{v7.2d},[x5]		// v7 = word-rotation index for TBL
	adrp	x5,.Lfk
	add	x5,x5,#:lo12:.Lfk
	ld1	{v6.2d},[x5]
	eor	v5.16b,v5.16b,v6.16b	// K = user key ^ FK
	mov	x6,#32			// 32 round keys to produce
	adrp	x5,.Lck
	add	x5,x5,#:lo12:.Lck	// x5 walks CK[0..31]
	// All-zero AESE round key.  Nothing else in the loop writes v1, so
	// it is built once here instead of once per iteration.
	movi	v1.16b,#0
	cbnz	w2,1f
	add	x1,x1,124		// reverse order: start at the last slot
1:
	mov	w7,v5.s[1]
	ldr	w8,[x5],#4		// w8 = CK[i]
	eor	w8,w8,w7
	mov	w7,v5.s[2]
	eor	w8,w8,w7
	mov	w7,v5.s[3]
	eor	w8,w8,w7		// w8 = K1 ^ K2 ^ K3 ^ CK[i]
	// optimize sbox using AESE instruction
	// Only lane 0 of v4 is meaningful; only lane 0 of the result is read.
	mov	v4.s[0],w8
	tbl	v0.16b, {v4.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	mov	w7,v0.s[0]
	// key-schedule linear map: x ^ rol(x,13) ^ rol(x,23)
	eor	w8,w7,w7,ror #19
	eor	w8,w8,w7,ror #9
	mov	w7,v5.s[0]
	eor	w8,w8,w7		// rk = K0 ^ Lk(...)
	mov	v5.s[0],w8
	cbz	w2,2f
	str	w8,[x1],#4		// forward order
	b	3f
2:
	str	w8,[x1],#-4		// reverse order
3:
	tbl	v5.16b,{v5.16b},v7.16b	// rotate key state by one word
	subs	x6,x6,#1
	b.ne	1b
	ret
.size	_vpsm4_ex_set_key,.-_vpsm4_ex_set_key
// SM4 T-transform for the 4-block kernel: S-box followed by the linear
// map L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24), applied
// in place to the four 32-bit lanes of the vector register named by the
// blk argument.  Expects v26-v31 to hold the .Lsbox_magic vectors.
// Scratch: v0-v3, v24.
.macro	vpsm4_ex_4blks_sbox_lt blk
	tbl	v0.16b,{\blk\().16b},v26.16b
	ushr	v24.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v28.16b},v0.16b
	tbl	v24.16b,{v27.16b},v24.16b
	eor	v0.16b,v0.16b,v24.16b
	eor	v1.16b,v1.16b,v1.16b
	aese	v0.16b,v1.16b
	ushr	v24.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v30.16b},v0.16b
	tbl	v24.16b,{v29.16b},v24.16b
	eor	v0.16b,v0.16b,v24.16b
	mov	\blk\().16b,v0.16b
	ushr	v0.4s,\blk\().4s,#30
	ushr	v1.4s,\blk\().4s,#22
	ushr	v2.4s,\blk\().4s,#14
	ushr	v3.4s,\blk\().4s,#8
	sli	v0.4s,\blk\().4s,#2
	sli	v1.4s,\blk\().4s,#10
	sli	v2.4s,\blk\().4s,#18
	sli	v3.4s,\blk\().4s,#24
	eor	v24.16b,v0.16b,\blk\().16b
	eor	v24.16b,v24.16b,v1.16b
	eor	\blk\().16b,v2.16b,v3.16b
	eor	\blk\().16b,\blk\().16b,v24.16b
.endm

// _vpsm4_ex_enc_4blks: run the 32 SM4 rounds on four blocks at once.
//
// In:   v4-v7   = state words B0..B3, one block per 32-bit lane
//       x3      = round keys (32 words)
//       v26-v31 = .Lsbox_magic vectors
// Out:  v0-v3   = result words; v0 is taken from B3, v1 from B2,
//                 v2 from B1, v3 from B0, byte-swapped per word on
//                 little-endian builds
// Clobbers: x10, w7, w8, w11, v0-v7, v12-v14, v24, NZCV
.type	_vpsm4_ex_enc_4blks,%function
.align	4
_vpsm4_ex_enc_4blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3			// x10 = round-key cursor
	mov	w11,#8			// 8 iterations of 4 rounds each
10:
	ldp	w7,w8,[x10],#8
	dup	v12.4s,w7
	dup	v13.4s,w8

	// B0 ^= T(B1 ^ B2 ^ B3 ^ rk0)
	eor	v14.16b,v6.16b,v7.16b
	eor	v12.16b,v5.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	vpsm4_ex_4blks_sbox_lt v12
	eor	v4.16b,v4.16b,v12.16b

	// B1 ^= T(B0 ^ B2 ^ B3 ^ rk1)
	eor	v14.16b,v14.16b,v4.16b
	eor	v13.16b,v14.16b,v13.16b
	vpsm4_ex_4blks_sbox_lt v13
	ldp	w7,w8,[x10],#8		// fetch rk2, rk3 for the next two rounds
	eor	v5.16b,v5.16b,v13.16b

	dup	v12.4s,w7
	dup	v13.4s,w8

	// B2 ^= T(B0 ^ B1 ^ B3 ^ rk2)
	eor	v14.16b,v4.16b,v5.16b
	eor	v12.16b,v7.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	vpsm4_ex_4blks_sbox_lt v12
	eor	v6.16b,v6.16b,v12.16b

	// B3 ^= T(B0 ^ B1 ^ B2 ^ rk3)
	eor	v14.16b,v14.16b,v6.16b
	eor	v13.16b,v14.16b,v13.16b
	vpsm4_ex_4blks_sbox_lt v13
	eor	v7.16b,v7.16b,v13.16b

	subs	w11,w11,#1
	b.ne	10b

	// Final word reversal (B3,B2,B1,B0) into v0..v3.
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
	rev32	v2.16b,v5.16b
	rev32	v1.16b,v6.16b
	rev32	v0.16b,v7.16b
#else
	mov	v3.16b,v4.16b
	mov	v2.16b,v5.16b
	mov	v1.16b,v6.16b
	mov	v0.16b,v7.16b
#endif
	ret
.size	_vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks
// SM4 T-transform for the 8-block kernel.  Applies the S-box and then
// L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) in place to
// v12 (blocks 0-3) and v13 (blocks 4-7).  Expects v26-v31 to hold the
// .Lsbox_magic vectors.  Scratch: v0-v3, v24, v25.
.macro	vpsm4_ex_8blks_sbox_lt
	tbl	v0.16b,{v12.16b},v26.16b
	tbl	v1.16b,{v13.16b},v26.16b
	ushr	v24.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v28.16b},v0.16b
	tbl	v24.16b,{v27.16b},v24.16b
	eor	v0.16b,v0.16b,v24.16b
	ushr	v24.16b,v1.16b,#4
	and	v1.16b,v1.16b,v31.16b
	tbl	v1.16b,{v28.16b},v1.16b
	tbl	v24.16b,{v27.16b},v24.16b
	eor	v1.16b,v1.16b,v24.16b
	eor	v25.16b,v25.16b,v25.16b
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v30.16b},v0.16b
	tbl	v24.16b,{v29.16b},v24.16b
	eor	v0.16b,v0.16b,v24.16b
	ushr	v24.16b,v1.16b,#4
	and	v1.16b,v1.16b,v31.16b
	tbl	v1.16b,{v30.16b},v1.16b
	tbl	v24.16b,{v29.16b},v24.16b
	eor	v1.16b,v1.16b,v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b
	ushr	v0.4s,v12.4s,#30
	ushr	v25.4s,v13.4s,#30
	ushr	v1.4s,v12.4s,#22
	ushr	v2.4s,v12.4s,#14
	ushr	v3.4s,v12.4s,#8
	sli	v0.4s,v12.4s,#2
	sli	v25.4s,v13.4s,#2
	sli	v1.4s,v12.4s,#10
	sli	v2.4s,v12.4s,#18
	sli	v3.4s,v12.4s,#24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,#22
	ushr	v2.4s,v13.4s,#14
	ushr	v3.4s,v13.4s,#8
	sli	v1.4s,v13.4s,#10
	sli	v2.4s,v13.4s,#18
	sli	v3.4s,v13.4s,#24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
.endm

// _vpsm4_ex_enc_8blks: run the 32 SM4 rounds on eight blocks at once.
//
// In:   v4-v7   = state words B0..B3 of blocks 0-3 (one block per lane)
//       v8-v11  = state words B0..B3 of blocks 4-7
//       x3      = round keys (32 words)
//       v26-v31 = .Lsbox_magic vectors
// Out:  v0-v3   = result words of blocks 0-3 (taken from B3,B2,B1,B0)
//       v4-v7   = result words of blocks 4-7 (taken from B3,B2,B1,B0)
//       Words are byte-swapped on little-endian builds.
// Clobbers: x10, w7, w8, w11, v0-v15, v24, v25, NZCV
.type	_vpsm4_ex_enc_8blks,%function
.align	4
_vpsm4_ex_enc_8blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3			// x10 = round-key cursor
	mov	w11,#8			// 8 iterations of 4 rounds each
10:
	ldp	w7,w8,[x10],#8

	// B0 ^= T(B1 ^ B2 ^ B3 ^ rk0)
	dup	v12.4s,w7
	eor	v14.16b,v6.16b,v7.16b
	eor	v15.16b,v10.16b,v11.16b
	eor	v0.16b,v5.16b,v12.16b
	eor	v1.16b,v9.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	vpsm4_ex_8blks_sbox_lt
	eor	v4.16b,v4.16b,v12.16b
	eor	v8.16b,v8.16b,v13.16b

	// B1 ^= T(B0 ^ B2 ^ B3 ^ rk1)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v4.16b
	eor	v15.16b,v15.16b,v8.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	vpsm4_ex_8blks_sbox_lt
	ldp	w7,w8,[x10],#8		// fetch rk2, rk3 for the next two rounds
	eor	v5.16b,v5.16b,v12.16b
	eor	v9.16b,v9.16b,v13.16b

	// B2 ^= T(B0 ^ B1 ^ B3 ^ rk2)
	dup	v12.4s,w7
	eor	v14.16b,v4.16b,v5.16b
	eor	v15.16b,v8.16b,v9.16b
	eor	v0.16b,v7.16b,v12.16b
	eor	v1.16b,v11.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	vpsm4_ex_8blks_sbox_lt
	eor	v6.16b,v6.16b,v12.16b
	eor	v10.16b,v10.16b,v13.16b

	// B3 ^= T(B0 ^ B1 ^ B2 ^ rk3)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v10.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	vpsm4_ex_8blks_sbox_lt
	eor	v7.16b,v7.16b,v12.16b
	eor	v11.16b,v11.16b,v13.16b

	subs	w11,w11,#1
	b.ne	10b

	// Final word reversal.  Blocks 0-3 must be moved out of v4-v7
	// before those registers receive the results of blocks 4-7.
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
	rev32	v2.16b,v5.16b
	rev32	v1.16b,v6.16b
	rev32	v0.16b,v7.16b
	rev32	v7.16b,v8.16b
	rev32	v6.16b,v9.16b
	rev32	v5.16b,v10.16b
	rev32	v4.16b,v11.16b
#else
	mov	v3.16b,v4.16b
	mov	v2.16b,v5.16b
	mov	v1.16b,v6.16b
	mov	v0.16b,v7.16b
	mov	v7.16b,v8.16b
	mov	v6.16b,v9.16b
	mov	v5.16b,v10.16b
	mov	v4.16b,v11.16b
#endif
	ret
.size	_vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks
// vpsm4_ex_set_encrypt_key: build the round-key schedule in forward order.
//
// In:   x0 = user key (16 bytes), x1 = round-key buffer
// Both arguments are passed through untouched to _vpsm4_ex_set_key with
// the direction flag w2 set to 1.
.globl	vpsm4_ex_set_encrypt_key
.type	vpsm4_ex_set_encrypt_key,%function
.align	5
vpsm4_ex_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	mov	w2,#1			// non-zero: store keys ascending
	stp	x29,x30,[sp,#-16]!	// the call below overwrites x30
	bl	_vpsm4_ex_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key
// vpsm4_ex_set_decrypt_key: build the round-key schedule in reverse order.
//
// In:   x0 = user key (16 bytes), x1 = round-key buffer
// Both arguments are passed through untouched to _vpsm4_ex_set_key with
// the direction flag w2 set to 0, which makes it fill the buffer from
// the last slot downwards.
.globl	vpsm4_ex_set_decrypt_key
.type	vpsm4_ex_set_decrypt_key,%function
.align	5
vpsm4_ex_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	mov	w2,#0			// zero: store keys descending
	stp	x29,x30,[sp,#-16]!	// the call below overwrites x30
	bl	_vpsm4_ex_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key
// Scalar SM4 T-transform used by vpsm4_ex_encrypt.
// In:  w6 = input word; v26-v31 = .Lsbox_magic vectors
// Out: w6 = L(Sbox(input)), L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18)
//      ^ rol(x,24)
// Scratch: w7, v0-v3.  Only lane 0 of v3 and of the result is meaningful.
.macro	vpsm4_ex_encrypt_sbox_lt
	mov	v3.s[0],w6
	tbl	v0.16b,{v3.16b},v26.16b
	ushr	v2.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v28.16b},v0.16b
	tbl	v2.16b,{v27.16b},v2.16b
	eor	v0.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v30.16b},v0.16b
	tbl	v2.16b,{v29.16b},v2.16b
	eor	v0.16b,v0.16b,v2.16b
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #30
	eor	w6,w6,w7,ror #22
	eor	w6,w6,w7,ror #14
	eor	w6,w6,w7,ror #8
.endm

// vpsm4_ex_encrypt: process one 16-byte block with the given round keys.
//
// In:   x0 = input block, x1 = output block, x2 = round keys (32 words)
// Clobbers: x3, x6-x15, v0-v4, v26-v31, NZCV
.globl	vpsm4_ex_encrypt
.type	vpsm4_ex_encrypt,%function
.align	5
vpsm4_ex_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x9,.Lsbox_magic
	add	x9,x9,#:lo12:.Lsbox_magic
	ldr	q26,[x9]
	ldr	q27,[x9,#16]
	ldr	q28,[x9,#32]
	ldr	q29,[x9,#48]
	ldr	q30,[x9,#64]
	ldr	q31,[x9,#80]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b		// block words are big-endian in memory
#endif
	mov	x3,x2
	mov	x10,x3			// x10 = round-key cursor
	mov	w11,#8			// 8 iterations of 4 rounds each
	// State words B0..B3 live in w12..w15.
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],#8

	// B0 ^= T(B1 ^ B2 ^ B3 ^ rk0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	vpsm4_ex_encrypt_sbox_lt
	eor	w12,w12,w6

	// B1 ^= T(B0 ^ B2 ^ B3 ^ rk1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	vpsm4_ex_encrypt_sbox_lt
	ldp	w7,w8,[x10],#8		// fetch rk2, rk3
	eor	w13,w13,w6

	// B2 ^= T(B0 ^ B1 ^ B3 ^ rk2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	vpsm4_ex_encrypt_sbox_lt
	eor	w14,w14,w6

	// B3 ^= T(B0 ^ B1 ^ B2 ^ rk3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	vpsm4_ex_encrypt_sbox_lt
	eor	w15,w15,w6

	subs	w11,w11,#1
	b.ne	10b

	// Output is the state in reversed word order: B3,B2,B1,B0.
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_ex_encrypt,.-vpsm4_ex_encrypt
// Scalar SM4 T-transform used by vpsm4_ex_decrypt.
// In:  w6 = input word; v26-v31 = .Lsbox_magic vectors
// Out: w6 = L(Sbox(input)), L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18)
//      ^ rol(x,24)
// Scratch: w7, v0-v3.  Only lane 0 of v3 and of the result is meaningful.
.macro	vpsm4_ex_decrypt_sbox_lt
	mov	v3.s[0],w6
	tbl	v0.16b,{v3.16b},v26.16b
	ushr	v2.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v28.16b},v0.16b
	tbl	v2.16b,{v27.16b},v2.16b
	eor	v0.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v30.16b},v0.16b
	tbl	v2.16b,{v29.16b},v2.16b
	eor	v0.16b,v0.16b,v2.16b
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #30
	eor	w6,w6,w7,ror #22
	eor	w6,w6,w7,ror #14
	eor	w6,w6,w7,ror #8
.endm

// vpsm4_ex_decrypt: process one 16-byte block with the given round keys.
// The instruction sequence is the same as in vpsm4_ex_encrypt; the round
// keys are simply consumed in the order in which they are stored.
//
// In:   x0 = input block, x1 = output block, x2 = round keys (32 words)
// Clobbers: x3, x6-x15, v0-v4, v26-v31, NZCV
.globl	vpsm4_ex_decrypt
.type	vpsm4_ex_decrypt,%function
.align	5
vpsm4_ex_decrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x9,.Lsbox_magic
	add	x9,x9,#:lo12:.Lsbox_magic
	ldr	q26,[x9]
	ldr	q27,[x9,#16]
	ldr	q28,[x9,#32]
	ldr	q29,[x9,#48]
	ldr	q30,[x9,#64]
	ldr	q31,[x9,#80]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b		// block words are big-endian in memory
#endif
	mov	x3,x2
	mov	x10,x3			// x10 = round-key cursor
	mov	w11,#8			// 8 iterations of 4 rounds each
	// State words B0..B3 live in w12..w15.
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],#8

	// B0 ^= T(B1 ^ B2 ^ B3 ^ rk0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	vpsm4_ex_decrypt_sbox_lt
	eor	w12,w12,w6

	// B1 ^= T(B0 ^ B2 ^ B3 ^ rk1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	vpsm4_ex_decrypt_sbox_lt
	ldp	w7,w8,[x10],#8		// fetch rk2, rk3
	eor	w13,w13,w6

	// B2 ^= T(B0 ^ B1 ^ B3 ^ rk2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	vpsm4_ex_decrypt_sbox_lt
	eor	w14,w14,w6

	// B3 ^= T(B0 ^ B1 ^ B2 ^ rk3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	vpsm4_ex_decrypt_sbox_lt
	eor	w15,w15,w6

	subs	w11,w11,#1
	b.ne	10b

	// Output is the state in reversed word order: B3,B2,B1,B0.
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_ex_decrypt,.-vpsm4_ex_decrypt
// Scalar SM4 T-transform used by the single-block tail of
// vpsm4_ex_ecb_encrypt.
// In:  w6 = input word; v26-v31 = .Lsbox_magic vectors
// Out: w6 = L(Sbox(input)), L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18)
//      ^ rol(x,24)
// Scratch: w7, v0-v3.  Only lane 0 of v3 and of the result is meaningful.
.macro	vpsm4_ex_ecb_sbox_lt
	mov	v3.s[0],w6
	tbl	v0.16b,{v3.16b},v26.16b
	ushr	v2.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v28.16b},v0.16b
	tbl	v2.16b,{v27.16b},v2.16b
	eor	v0.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b,v0.16b,#4
	and	v0.16b,v0.16b,v31.16b
	tbl	v0.16b,{v30.16b},v0.16b
	tbl	v2.16b,{v29.16b},v2.16b
	eor	v0.16b,v0.16b,v2.16b
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #30
	eor	w6,w6,w7,ror #22
	eor	w6,w6,w7,ror #14
	eor	w6,w6,w7,ror #8
.endm

// vpsm4_ex_ecb_encrypt: ECB-process a buffer with the given round keys.
//
// In:   x0 = input, x1 = output, x2 = length in bytes (shifted right by
//       four to get the block count), x3 = round keys (32 words)
// Blocks are handled eight at a time, then four at a time, then a tail
// of one, two or three blocks.
// d8-d15 and x29/x30 are saved on an 80-byte frame and restored on exit.
.globl	vpsm4_ex_ecb_encrypt
.type	vpsm4_ex_ecb_encrypt,%function
.align	5
vpsm4_ex_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,#4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	adrp	x9,.Lsbox_magic
	add	x9,x9,#:lo12:.Lsbox_magic
	ldr	q26,[x9]
	ldr	q27,[x9,#16]
	ldr	q28,[x9,#32]
	ldr	q29,[x9,#48]
	ldr	q30,[x9,#64]
	ldr	q31,[x9,#80]

	// ---- eight blocks per iteration ----
.Lecb_8_blocks_process:
	cmp	w2,#8
	b.lt	.Lecb_4_blocks_process
	// ld4 de-interleaves: v4..v7 receive word 0..3 of blocks 0-3,
	// v8..v11 receive word 0..3 of blocks 4-7.
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
	rev32	v8.16b,v8.16b
	rev32	v9.16b,v9.16b
	rev32	v10.16b,v10.16b
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_ex_enc_8blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lecb_8_blocks_process
	b	100f

	// ---- four blocks ----
.Lecb_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	w2,w2,#4
1:
	// ---- tail: zero, one, two or three blocks left ----
	cmp	w2,#1
	b.lt	100f			// nothing left
	b.gt	1f			// two or three blocks

	// exactly one block: scalar rounds, state words B0..B3 in w12..w15
	ld1	{v4.4s},[x0]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3			// x10 = round-key cursor
	mov	w11,#8			// 8 iterations of 4 rounds each
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],#8

	// B0 ^= T(B1 ^ B2 ^ B3 ^ rk0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	vpsm4_ex_ecb_sbox_lt
	eor	w12,w12,w6

	// B1 ^= T(B0 ^ B2 ^ B3 ^ rk1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	vpsm4_ex_ecb_sbox_lt
	ldp	w7,w8,[x10],#8		// fetch rk2, rk3
	eor	w13,w13,w6

	// B2 ^= T(B0 ^ B1 ^ B3 ^ rk2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	vpsm4_ex_ecb_sbox_lt
	eor	w14,w14,w6

	// B3 ^= T(B0 ^ B1 ^ B2 ^ rk3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	vpsm4_ex_ecb_sbox_lt
	eor	w15,w15,w6

	subs	w11,w11,#1
	b.ne	10b

	// Output is the state in reversed word order: B3,B2,B1,B0.
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	b	100f

1:	// two or three blocks: load lanes 0 and 1, then decide
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
	cmp	w2,#2
	b.gt	1f
	// exactly two blocks: the 4-block kernel runs on all lanes, but
	// only lanes 0 and 1 are stored
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1]
	b	100f

1:	// three blocks: load lane 2 as well, store lanes 0 to 2
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1]

100:
	// common exit: restore saved registers and release the frame
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt
.globl	vpsm4_ex_cbc_encrypt
.type	vpsm4_ex_cbc_encrypt,%function
.align	5
vpsm4_ex_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	x2,x2,4
	adrp	x9, .Lsbox_magic
	add	x9, x9, #:lo12:.Lsbox_magic
	ldr	q26, [x9]
	ldr	q27, [x9, 16]
	ldr	q28, [x9, 32]
	ldr	q29, [x9, 48]
	ldr	q30, [x9, 64]
	ldr	q31, [x9, 80]
	cbz	w5,.Ldec
	ld1	{v3.4s},[x4]
.Lcbc_4_blocks_enc:
	cmp	w2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor	v4.16b,v4.16b,v3.16b
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
	eor	v5.16b,v5.16b,v4.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v5.s[0]
	mov	w13,v5.s[1]
	mov	w14,v5.s[2]
	mov	w15,v5.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v5.s[0],w15
	mov	v5.s[1],w14
	mov	v5.s[2],w13
	mov	v5.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v6.16b,v6.16b,v5.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v6.s[0]
	mov	w13,v6.s[1]
	mov	w14,v6.s[2]
	mov	w15,v6.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v6.s[0],w15
	mov	v6.s[1],w14
	mov	v6.s[2],w13
	mov	v6.s[3],w12
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	eor	v7.16b,v7.16b,v6.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v7.s[0]
	mov	w13,v7.s[1]
	mov	w14,v7.s[2]
	mov	w15,v7.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v7.s[0],w15
	mov	v7.s[1],w14
	mov	v7.s[2],w13
	mov	v7.s[3],w12
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	orr	v3.16b,v7.16b,v7.16b
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#4
	b.ne	.Lcbc_4_blocks_enc
	b	2f
1:
	subs	w2,w2,#1
	b.lt	2f
	ld1	{v4.4s},[x0],#16
	eor	v3.16b,v3.16b,v4.16b
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w15,v3.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	st1	{v3.4s},[x1],#16
	b	1b
2:
	// save back IV
	st1	{v3.4s},[x4]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp	w2,#8
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
	add	x10,x0,#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_ex_enc_8blks
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	ld1	{v15.4s},[x4]
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	// note ivec1 and vtmpx[3] are reusing the same register
	// care needs to be taken to avoid conflict
	eor	v0.16b,v0.16b,v15.16b
	ld1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	eor	v1.16b,v1.16b,v8.16b
	eor	v2.16b,v2.16b,v9.16b
	eor	v3.16b,v3.16b,v10.16b
	// save back IV
	st1	{v15.4s}, [x4]
	eor	v4.16b,v4.16b,v11.16b
	eor	v5.16b,v5.16b,v12.16b
	eor	v6.16b,v6.16b,v13.16b
	eor	v7.16b,v7.16b,v14.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lcbc_8_blocks_dec
	b.eq	100f
1:
	ld1	{v15.4s},[x4]
.Lcbc_4_blocks_dec:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	eor	v0.16b,v0.16b,v15.16b
	eor	v1.16b,v1.16b,v4.16b
	orr	v15.16b,v7.16b,v7.16b
	eor	v2.16b,v2.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	subs	w2,w2,#4
	b.gt	.Lcbc_4_blocks_dec
	// save back IV
	st1	{v7.4s}, [x4]
	b	100f
1:	//	last block
	subs	w2,w2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0],#16
	// save back IV
	st1	{v4.4s}, [x4]
#ifndef __AARCH64EB__
	rev32	v8.16b,v4.16b
#else
	mov	v8.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v8.s[0]
	mov	w13,v8.s[1]
	mov	w14,v8.s[2]
	mov	w15,v8.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v8.s[0],w15
	mov	v8.s[1],w14
	mov	v8.s[2],w13
	mov	v8.s[3],w12
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	eor	v8.16b,v8.16b,v15.16b
	st1	{v8.4s},[x1],#16
	b	100f
1:	//	last two blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0]
	add	x10,x0,#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
	subs	w2,w2,1
	b.gt	1f
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	ld1	{v4.4s,v5.4s},[x0],#32
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	eor	v0.16b,v0.16b,v15.16b
	eor	v1.16b,v1.16b,v4.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save back IV
	st1	{v5.4s}, [x4]
	b	100f
1:	//	last 3 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	eor	v0.16b,v0.16b,v15.16b
	eor	v1.16b,v1.16b,v4.16b
	eor	v2.16b,v2.16b,v5.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save back IV
	st1	{v6.4s}, [x4]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt
// ---------------------------------------------------------------------
// vpsm4_ex_ctr32_encrypt_blocks: SM4 in CTR mode with a 32-bit counter.
//
// Register usage as seen in this routine:
//   x0  - input pointer  (read with ld1/ld4, post-incremented)
//   x1  - output pointer (written with st1/st4, post-incremented)
//   w2  - number of 16-byte blocks (only the low 32 bits are examined)
//   x3  - pointer to the round keys; 32 words are consumed per block
//   x4  - pointer to the 16-byte counter block; it is only read here,
//         this routine never stores an updated counter back
//
// Each keystream block is the SM4 encryption of the counter block and
// is XORed with the input.  Only the last 32-bit word of the counter is
// incremented (add on a w register, so it wraps without carrying into
// the other three words).
//
// v26-v31 hold the six 16-byte constants at .Lsbox_magic for the whole
// routine.  v0-v3 are scratch in the scalar paths.  The multi-block
// path additionally uses v4-v15 and therefore saves d8-d15 and x29/x30
// in an 80-byte stack frame.
// ---------------------------------------------------------------------
.globl	vpsm4_ex_ctr32_encrypt_blocks
.type	vpsm4_ex_ctr32_encrypt_blocks,%function
.align	5
vpsm4_ex_ctr32_encrypt_blocks:
	// macro supplied by arm_arch.h (presumably the BTI landing pad - confirm there)
	AARCH64_VALID_CALL_TARGET
	// v3 = counter block; on little-endian builds swap the bytes of each
	// word so the four words can be handled as native 32-bit integers
	ld1	{v3.4s},[x4]
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	// load the S-box helper constants:
	//   v26      byte permutation applied before the nibble lookups
	//   v27/v28  high-nibble / low-nibble tables used before AESE
	//   v29/v30  high-nibble / low-nibble tables used after AESE
	//   v31      0x0f in every byte (low-nibble mask)
	adrp	x9, .Lsbox_magic
	add	x9, x9, #:lo12:.Lsbox_magic
	ldr	q26, [x9]
	ldr	q27, [x9, 16]
	ldr	q28, [x9, 32]
	ldr	q29, [x9, 48]
	ldr	q30, [x9, 64]
	ldr	q31, [x9, 80]
	// anything other than exactly one block takes the general path at 1:
	cmp	w2,#1
	b.ne	1f
	// fast processing for one single block without
	// context saving overhead
	// x10 walks the round keys, w11 counts 8 iterations of 4 rounds each,
	// w12..w15 hold the four state words B0..B3
	mov	x10,x3
	mov	w11,#8
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w15,v3.s[3]
10:
	// w7,w8 = next two round keys (RK0, RK1)
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	// Steps: permute bytes with v26, apply a byte-wise map built from two
	// 4-bit table lookups (v27 indexed by the high nibble, v28 by the low
	// nibble, results XORed), run AESE with an all-zero key operand in v1,
	// then apply a second nibble-table map (v29/v30).  Only lane s[0] of
	// the result is consumed below.
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	// linear step: w6 = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24),
	// each left rotation written as a right rotation by 32-n
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	// w7,w8 are free again: fetch the next two round keys (RK2, RK3)
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// the output block is the four state words in reverse order
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	// output = input XOR keystream; return without having touched the stack
	ld1	{v4.4s},[x0]
	eor	v4.16b,v4.16b,v3.16b
	st1	{v4.4s},[x1]
	ret
1:
	// general path: set up an 80-byte frame holding d8-d15 and x29/x30
	// (the bl calls below overwrite x30, and v8-v15 are used as scratch)
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	// w12..w14 = the three counter words that stay fixed,
	// w5 = the 32-bit running counter (last word)
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w5,v3.s[3]
.Lctr32_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	// build four counter blocks in word-sliced form: v4/v5/v6 carry
	// word 0/1/2 replicated into every lane, v7 carries four consecutive
	// counter values, one per lane
	dup	v4.4s,w12
	dup	v5.4s,w13
	dup	v6.4s,w14
	mov	v7.s[0],w5
	add	w5,w5,#1
	mov	v7.s[1],w5
	add	w5,w5,#1
	mov	v7.s[2],w5
	add	w5,w5,#1
	mov	v7.s[3],w5
	add	w5,w5,#1
	// with eight or more blocks left, extend to an 8-block batch
	cmp	w2,#8
	b.ge	.Lctr32_8_blocks_process
	// helper defined elsewhere in this file; its result is taken from v0-v3 below
	bl	_vpsm4_ex_enc_4blks
	// ld4 de-interleaves 64 input bytes so that v12..v15 each hold one
	// word position of four blocks, matching the layout of v0..v3;
	// st4 interleaves the XORed result back into block order
	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	subs	w2,w2,#4
	b.ne	.Lctr32_4_blocks_process
	b	100f
.Lctr32_8_blocks_process:
	// second group of four counter blocks in v8..v11, continuing the
	// count from where v7 left off
	dup	v8.4s,w12
	dup	v9.4s,w13
	dup	v10.4s,w14
	mov	v11.s[0],w5
	add	w5,w5,#1
	mov	v11.s[1],w5
	add	w5,w5,#1
	mov	v11.s[2],w5
	add	w5,w5,#1
	mov	v11.s[3],w5
	add	w5,w5,#1
	// helper defined elsewhere in this file; its result is taken from v0-v7 below
	bl	_vpsm4_ex_enc_8blks
	// 128 bytes of input, de-interleaved the same way as in the 4-block case
	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	eor	v4.16b,v4.16b,v8.16b
	eor	v5.16b,v5.16b,v9.16b
	eor	v6.16b,v6.16b,v10.16b
	eor	v7.16b,v7.16b,v11.16b
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.ne	.Lctr32_4_blocks_process
	b	100f
1:	//	last block processing
	// fewer than four blocks remain: none left -> exit, more than one
	// -> 2/3-block tail at the next 1:, exactly one -> fall through
	subs	w2,w2,#1
	b.lt	100f
	b.gt	1f
	// assemble the current counter block in v3, then run the same scalar
	// 32-round loop as the single-block fast path
	mov	v3.s[0],w12
	mov	v3.s[1],w13
	mov	v3.s[2],w14
	mov	v3.s[3],w5
	mov	x10,x3
	mov	w11,#8
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w15,v3.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// reversed word order gives the keystream block
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	ld1	{v4.4s},[x0]
	eor	v4.16b,v4.16b,v3.16b
	st1	{v4.4s},[x1]
	b	100f
1:	//	last 2 blocks processing
	// counters for the first two remaining blocks go into lanes 0 and 1
	// of v7; the 4-block helper is still used, and only the lanes that
	// were filled in are stored afterwards
	dup	v4.4s,w12
	dup	v5.4s,w13
	dup	v6.4s,w14
	mov	v7.s[0],w5
	add	w5,w5,#1
	mov	v7.s[1],w5
	// w2 reaches zero here when exactly two blocks were left
	subs	w2,w2,#1
	b.ne	1f
	bl	_vpsm4_ex_enc_4blks
	// single-lane ld4/st4 move one 16-byte block at a time into/out of
	// lane 0 and lane 1 of the word-sliced registers
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	b	100f
1:	//	last 3 blocks processing
	// third counter value goes into lane 2 of v7
	add	w5,w5,#1
	mov	v7.s[2],w5
	bl	_vpsm4_ex_enc_4blks
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
100:
	// common exit of the general path: restore d8-d15 and x29/x30,
	// release the 80-byte frame
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks
.globl	vpsm4_ex_xts_encrypt_gb
.type	vpsm4_ex_xts_encrypt_gb,%function
.align	5
vpsm4_ex_xts_encrypt_gb:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	x26,x3
	mov	x27,x4
	mov	w28,w6
	ld1	{v16.4s}, [x5]
	mov	x3,x27
	adrp	x9, .Lsbox_magic
	add	x9, x9, #:lo12:.Lsbox_magic
	ldr	q26, [x9]
	ldr	q27, [x9, 16]
	ldr	q28, [x9, 32]
	ldr	q29, [x9, 48]
	ldr	q30, [x9, 64]
	ldr	q31, [x9, 80]
#ifndef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v16.s[0]
	mov	w13,v16.s[1]
	mov	w14,v16.s[2]
	mov	w15,v16.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v16.s[0],w15
	mov	v16.s[1],w14
	mov	v16.s[2],w13
	mov	v16.s[3],w12
#ifndef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	x3,x26
	and	x29,x2,#0x0F
	// convert length into blocks
	lsr	x2,x2,4
	cmp	x2,#1
	b.lt	.return_gb

	cmp	x29,0
	// If the encryption/decryption length is a multiple of 16,
	// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
	b.eq	.xts_encrypt_blocks_gb

	// If the encryption/decryption length is not a multiple of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb;
	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
	subs	x2,x2,#1
	b.eq	.only_2blks_tweak_gb
.xts_encrypt_blocks_gb:
	rbit	v16.16b,v16.16b
#ifdef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	x12,v16.d[0]
	mov	x13,v16.d[1]
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
.Lxts_8_blocks_process_gb:
	cmp	x2,#8
	mov	v16.d[0],x12
	mov	v16.d[1],x13
#ifdef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	w7,0x87
	extr	x9,x27,x27,#32
	extr	x13,x27,x26,#63
	and	w8,w7,w9,asr#31
	eor	x12,x8,x26,lsl#1
	mov	v17.d[0],x14
	mov	v17.d[1],x15
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	v18.d[0],x16
	mov	v18.d[1],x17
#ifdef __AARCH64EB__
	rev32	v18.16b,v18.16b
#endif
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	v19.d[0],x18
	mov	v19.d[1],x19
#ifdef __AARCH64EB__
	rev32	v19.16b,v19.16b
#endif
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	v20.d[0],x20
	mov	v20.d[1],x21
#ifdef __AARCH64EB__
	rev32	v20.16b,v20.16b
#endif
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	v21.d[0],x22
	mov	v21.d[1],x23
#ifdef __AARCH64EB__
	rev32	v21.16b,v21.16b
#endif
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	v22.d[0],x24
	mov	v22.d[1],x25
#ifdef __AARCH64EB__
	rev32	v22.16b,v22.16b
#endif
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	v23.d[0],x26
	mov	v23.d[1],x27
#ifdef __AARCH64EB__
	rev32	v23.16b,v23.16b
#endif
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
	b.lt	.Lxts_4_blocks_process_gb
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	rbit	v19.16b,v19.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	rbit	v20.16b,v20.16b
	rbit	v21.16b,v21.16b
	rbit	v22.16b,v22.16b
	rbit	v23.16b,v23.16b
	eor	v8.16b, v8.16b, v20.16b
	eor	v9.16b, v9.16b, v21.16b
	eor	v10.16b, v10.16b, v22.16b
	eor	v11.16b, v11.16b, v23.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	zip1	v0.4s,v8.4s,v9.4s
	zip2	v1.4s,v8.4s,v9.4s
	zip1	v2.4s,v10.4s,v11.4s
	zip2	v3.4s,v10.4s,v11.4s
	zip1	v8.2d,v0.2d,v2.2d
	zip2	v9.2d,v0.2d,v2.2d
	zip1	v10.2d,v1.2d,v3.2d
	zip2	v11.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_8blks
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b

	// save the last tweak
	mov	v25.16b,v23.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	x2,x2,#8
	b.gt	.Lxts_8_blocks_process_gb
	b	100f
.Lxts_4_blocks_process_gb:
	cmp	x2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	rbit	v19.16b,v19.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	x2,x2,#4
	mov	v16.16b,v20.16b
	mov	v17.16b,v21.16b
	mov	v18.16b,v22.16b
	// save the last tweak
	mov	v25.16b,v19.16b
1:
	// process last block
	cmp	x2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0],#16
	rbit	v16.16b,v16.16b
	eor	v4.16b, v4.16b, v16.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v16.16b
	st1	{v4.4s},[x1],#16
	// save the last tweak
	mov	v25.16b,v16.16b
	b	100f
1:	//	process last 2 blocks
	cmp	x2,#2
	b.gt	1f
	ld1	{v4.4s,v5.4s},[x0],#32
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save the last tweak
	mov	v25.16b,v17.16b
	b	100f
1:	//	process last 3 blocks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	mov	v25.16b,v18.16b
100:
	cmp	x29,0
	b.eq	.return_gb

// .last_2blks_tweak_gb: derive the two tweaks needed for the ciphertext-stealing
// tail. Reached by falling through the x29 != 0 check above, i.e. after the
// whole-block passes when a partial trailing block remains (total length
// greater than 32 bytes).
//   in : v25 = last tweak applied by the whole-block passes, in the bit order
//        that was XORed into the data
//   out: v17 = next tweak, v18 = the one after it (same bit order as v25)
//   clobbers: x9, v0, v1, v2
// Each step multiplies the tweak by x in GF(2^128): every byte lane is shifted
// left by one and the bit shifted out of lane i is folded into lane i+1; the
// bit leaving lane 15 is folded into lane 0 as 0x87. The .Lxts_magic vector
// supplies those per-lane constants (0x87 for lane 0, 0x01 elsewhere).
// This _gb variant performs the doubling on the per-byte bit-reversed tweak,
// hence the rbit on the way in and on the way out.
.last_2blks_tweak_gb:
#ifdef __AARCH64EB__
	rev32	v25.16b,v25.16b
#endif
	// v0 = carry multipliers; loaded once and reused for both doublings
	adrp	x9, .Lxts_magic
	ldr	q0, [x9, #:lo12:.Lxts_magic]
	rbit	v2.16b,v25.16b
	// first doubling: v17 = v2 * x (still in the bit-reversed domain)
	shl	v17.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15	// v1[i] = v2[i-1], v1[0] = v2[15]
	ushr	v1.16b, v1.16b, #7		// keep only the bit each lane shifts out
	mul	v1.16b, v1.16b, v0.16b	// scale carries: 0x87 or 0x01 per lane
	eor	v17.16b, v17.16b, v1.16b
	// second doubling: v18 = v17 * x, taken before v17 is converted back
	shl	v18.16b, v17.16b, #1
	ext	v1.16b, v17.16b, v17.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v18.16b, v18.16b, v1.16b
	// convert both results back to the bit order used for the data XOR
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	b	.check_dec_gb


// .only_2blks_tweak_gb: derive the two tail tweaks when no whole-block pass
// has run, so only two tweaks are needed in total.
// NOTE(review): the branch into this label is outside this view. The non-_gb
// twin (.only_2blks_tweak) is entered when the length is between 17 and 31
// bytes, i.e. one whole block plus a partial block, not "equal to 32" as the
// previous comment said - confirm the same holds here.
//   in : v16 = current tweak (presumably the encrypted initial tweak)
//   out: v17 = copy of v16, v18 = v17 multiplied by x in GF(2^128)
//   clobbers: x9, v0, v1, v2
.only_2blks_tweak_gb:
	mov	v17.16b,v16.16b
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
	// doubling is done on the per-byte bit-reversed value (rbit in, rbit out)
	rbit	v2.16b,v17.16b
	// v0 = per-lane carry multipliers (0x87 for lane 0, 0x01 elsewhere)
	adrp	x9, .Lxts_magic
	ldr	q0, [x9, #:lo12:.Lxts_magic]
	shl	v18.16b, v2.16b, #1
	// rotate lanes up by one so each lane sees the previous lane, then keep
	// only the bit that lane shifted out and scale it by the multiplier
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v18.16b, v18.16b, v1.16b
	rbit	v18.16b,v18.16b
	b	.check_dec_gb


// .check_dec_gb: pick the order in which the two tail tweaks are applied.
// The code that follows uses v17 for the last whole block and v18 for the
// merged (stolen) block. For decryption the two tweaks must be exchanged.
.check_dec_gb:
	// w28 == 1 selects encryption and keeps v17/v18 as computed.
	// Any other value takes the swap path (0 is the documented decryption value).
	cmp	w28,1
	b.eq	.process_last_2blks_gb
	// swap v17 <-> v18, using v0 as scratch
	mov	v0.16B,v17.16b
	mov	v17.16B,v18.16b
	mov	v18.16B,v0.16b

.process_last_2blks_gb:
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
#ifdef __AARCH64EB__
	rev32	v18.16b,v18.16b
#endif
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v17.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v17.16b
	st1	{v4.4s},[x1],#16

	// Ciphertext stealing for the trailing partial block.
	// x1 was post-incremented past the 16-byte block stored just above, so
	// x26 = x1 - 16 points back at that block. x29 bytes are exchanged below.
	sub	x26,x1,16
.loop_gb:
	// Walk the tail from its last byte down to byte 0. The flags set by subs
	// are not touched by the byte loads/stores and drive the b.gt below.
	subs	x29,x29,1
	// w7 = byte of the block just produced, w8 = matching byte of the input tail
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	// the input tail byte replaces it inside the block at x26 ...
	strb	w8,[x26,x29]
	// ... and the produced byte becomes part of the short final output
	strb	w7,[x1,x29]
	b.gt	.loop_gb
	// Process the merged block at x26 with the second tail tweak (v18):
	// XOR with tweak, 32 SM4 rounds, XOR with tweak, store back in place.
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v18.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	// x10 walks the round keys at x3; w11 = 8 iterations of 4 rounds each.
	// w12..w15 hold the four state words B0..B3.
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	// w7,w8 = next two round keys (RK0, RK1)
	ldp	w7,w8,[x10],8
	// B0 ^= L(SBOX(B1 ^ B2 ^ B3 ^ RK0))
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	// only lane s[0] of v3 carries data; only s[0] of the result is read back
	mov	v3.s[0],w6
	// SM4 sbox computed with AESE: byte permutation by v26, nibble-split table
	// lookups (v27 high nibble, v28 low nibble, v31 = 0x0f mask), AESE with an
	// all-zero key, then a second pair of nibble lookups (v29 high, v30 low).
	// NOTE(review): v26-v31 are presumably the .Lsbox_magic constants loaded
	// at function entry, which is outside this view - confirm.
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	// linear step: w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24)
	// (ror by 32-n is a rotate left by n)
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= L(SBOX(B0 ^ B2 ^ B3 ^ RK1))
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// sbox via AESE, same sequence as in the first round of this iteration
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	// w7 is free again: fetch the next two round keys (RK2, RK3)
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= L(SBOX(B0 ^ B1 ^ B3 ^ RK2))
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// sbox via AESE, same sequence as in the first round of this iteration
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= L(SBOX(B0 ^ B1 ^ B2 ^ RK3))
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// sbox via AESE, same sequence as in the first round of this iteration
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// write the state back in reversed word order (B3,B2,B1,B0)
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	// second tweak XOR, then overwrite the merged block in place
	eor	v4.16b, v4.16b, v18.16b
	st1	{v4.4s}, [x26]
// .return_gb: common exit of vpsm4_ex_xts_encrypt_gb.
// Pops twelve 16-byte register pairs (192 bytes) and returns.
// NOTE(review): the matching pushes are outside this view; the twin function
// vpsm4_ex_xts_encrypt pushes the same pairs in the exact reverse order -
// confirm this function does the same.
.return_gb:
	// d8-d15: low halves of v8-v15, which AAPCS64 requires callees to preserve
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	// x29 is used as a plain data register (tail byte count) in this function,
	// so the caller value is restored here together with the return address
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	// NOTE(review): x18 is platform-reserved on some ABIs (Apple, Windows);
	// this function both writes and restores it. x15-x17 are caller-saved
	// under AAPCS64, so restoring them is harmless but not required.
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	// macro from arm_arch.h; presumably the return-address check paired with
	// the signing done at function entry - confirm
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb
// vpsm4_ex_xts_encrypt: SM4-XTS entry point (twin of vpsm4_ex_xts_encrypt_gb
// without the per-byte bit reversal of the tweak).
// Register use as read by the code of this function:
//   x0 = input pointer, x1 = output pointer, x2 = length in bytes
//   x3 = round keys used for the data blocks
//   x4 = round keys used to encrypt the initial tweak
//   x5 = pointer to the 16-byte initial tweak
//   w6 = direction flag, later compared against 1 (encryption)
// NOTE(review): the C prototype is not visible in this part of the file -
// confirm the argument meaning against the declaring header.
.globl	vpsm4_ex_xts_encrypt
.type	vpsm4_ex_xts_encrypt,%function
.align	5
vpsm4_ex_xts_encrypt:
	// macro from arm_arch.h; presumably signs the return address - confirm
	AARCH64_SIGN_LINK_REGISTER
	// Push twelve 16-byte pairs (192 bytes). x15-x28 are used below as tweak
	// and scratch registers, x29 as the tail byte count.
	// NOTE(review): x18 is platform-reserved on some ABIs (Apple, Windows) but
	// is used as a tweak register by this function.
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	// low halves of v8-v15 are callee-saved under AAPCS64
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	// park the data key (x3), tweak key (x4) and direction flag (w6)
	mov	x26,x3
	mov	x27,x4
	mov	w28,w6
	// v16 = initial tweak
	ld1	{v16.4s}, [x5]
	// x3 temporarily points at the tweak key for the single-block
	// encryption of v16 that follows; it is set back to the data key afterwards
	mov	x3,x27
	// v26-v30 = the five sbox helper vectors at .Lsbox_magic,
	// v31 = the 0x0f nibble mask stored right after them
	adrp	x9, .Lsbox_magic
	add	x9, x9, #:lo12:.Lsbox_magic
	ldr	q26, [x9]
	ldr	q27, [x9, 16]
	ldr	q28, [x9, 32]
	ldr	q29, [x9, 48]
	ldr	q30, [x9, 64]
	ldr	q31, [x9, 80]
	// byte-swap each 32-bit word of the tweak on little-endian builds
#ifndef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v16.s[0]
	mov	w13,v16.s[1]
	mov	w14,v16.s[2]
	mov	w15,v16.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v16.s[0],w15
	mov	v16.s[1],w14
	mov	v16.s[2],w13
	mov	v16.s[3],w12
#ifndef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	x3,x26
	and	x29,x2,#0x0F
	// convert length into blocks
	lsr	x2,x2,4
	cmp	x2,#1
	b.lt	.return

	cmp	x29,0
	// If the encryption/decryption length is a multiple of 16,
	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
	b.eq	.xts_encrypt_blocks

	// If the encryption/decryption length is not a multiple of 16,
	// the last two blocks (one whole, one partial) are encrypted/decrypted via
	// .last_2blks_tweak or .only_2blks_tweak;
	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks
	subs	x2,x2,#1
	b.eq	.only_2blks_tweak
.xts_encrypt_blocks:
#ifdef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	x12,v16.d[0]
	mov	x13,v16.d[1]
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
.Lxts_8_blocks_process:
	cmp	x2,#8
	mov	v16.d[0],x12
	mov	v16.d[1],x13
#ifdef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	w7,0x87
	extr	x9,x27,x27,#32
	extr	x13,x27,x26,#63
	and	w8,w7,w9,asr#31
	eor	x12,x8,x26,lsl#1
	mov	v17.d[0],x14
	mov	v17.d[1],x15
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	v18.d[0],x16
	mov	v18.d[1],x17
#ifdef __AARCH64EB__
	rev32	v18.16b,v18.16b
#endif
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	v19.d[0],x18
	mov	v19.d[1],x19
#ifdef __AARCH64EB__
	rev32	v19.16b,v19.16b
#endif
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	v20.d[0],x20
	mov	v20.d[1],x21
#ifdef __AARCH64EB__
	rev32	v20.16b,v20.16b
#endif
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	v21.d[0],x22
	mov	v21.d[1],x23
#ifdef __AARCH64EB__
	rev32	v21.16b,v21.16b
#endif
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	v22.d[0],x24
	mov	v22.d[1],x25
#ifdef __AARCH64EB__
	rev32	v22.16b,v22.16b
#endif
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	v23.d[0],x26
	mov	v23.d[1],x27
#ifdef __AARCH64EB__
	rev32	v23.16b,v23.16b
#endif
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
	b.lt	.Lxts_4_blocks_process
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	eor	v8.16b, v8.16b, v20.16b
	eor	v9.16b, v9.16b, v21.16b
	eor	v10.16b, v10.16b, v22.16b
	eor	v11.16b, v11.16b, v23.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	zip1	v0.4s,v8.4s,v9.4s
	zip2	v1.4s,v8.4s,v9.4s
	zip1	v2.4s,v10.4s,v11.4s
	zip2	v3.4s,v10.4s,v11.4s
	zip1	v8.2d,v0.2d,v2.2d
	zip2	v9.2d,v0.2d,v2.2d
	zip1	v10.2d,v1.2d,v3.2d
	zip2	v11.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_8blks
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b

	// save the last tweak
	mov	v25.16b,v23.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	x2,x2,#8
	b.gt	.Lxts_8_blocks_process
	b	100f
.Lxts_4_blocks_process:
	cmp	x2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	x2,x2,#4
	mov	v16.16b,v20.16b
	mov	v17.16b,v21.16b
	mov	v18.16b,v22.16b
	// save the last tweak
	mov	v25.16b,v19.16b
1:
	// process last block
	cmp	x2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v16.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v16.16b
	st1	{v4.4s},[x1],#16
	// save the last tweak
	mov	v25.16b,v16.16b
	b	100f
1:	//	process last 2 blocks
	cmp	x2,#2
	b.gt	1f
	ld1	{v4.4s,v5.4s},[x0],#32
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save the last tweak
	mov	v25.16b,v17.16b
	b	100f
1:	//	process last 3 blocks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	mov	v25.16b,v18.16b
100:
	cmp	x29,0
	b.eq	.return

// .last_2blks_tweak: derive the two tweaks needed for the ciphertext-stealing
// tail. Reached by falling through the x29 != 0 check above, i.e. after the
// whole-block passes when a partial trailing block remains (total length
// greater than 32 bytes).
//   in : v25 = last tweak applied by the whole-block passes
//   out: v17 = v25 * x, v18 = v17 * x in GF(2^128)
//   clobbers: x9, v0, v1
// Doubling shifts every byte lane left by one and folds the bit shifted out
// of lane i into lane i+1; the bit leaving lane 15 is folded into lane 0 as
// 0x87. The .Lxts_magic vector supplies those per-lane constants
// (0x87 for lane 0, 0x01 elsewhere).
.last_2blks_tweak:
#ifdef __AARCH64EB__
	rev32	v25.16b,v25.16b
#endif
	// v0 = carry multipliers; loaded once and reused for both doublings
	adrp	x9, .Lxts_magic
	ldr	q0, [x9, #:lo12:.Lxts_magic]
	// first doubling: v17 = v25 * x
	shl	v17.16b, v25.16b, #1
	ext	v1.16b, v25.16b, v25.16b,#15	// v1[i] = v25[i-1], v1[0] = v25[15]
	ushr	v1.16b, v1.16b, #7		// keep only the bit each lane shifts out
	mul	v1.16b, v1.16b, v0.16b	// scale carries: 0x87 or 0x01 per lane
	eor	v17.16b, v17.16b, v1.16b
	// second doubling: v18 = v17 * x
	shl	v18.16b, v17.16b, #1
	ext	v1.16b, v17.16b, v17.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v18.16b, v18.16b, v1.16b
	b	.check_dec


// .only_2blks_tweak: derive the two tail tweaks when no whole-block pass runs.
// Entered from the length dispatch when the input is one whole block plus a
// partial block (17 to 31 bytes): the block count drops to zero after the
// decrement and the remainder in x29 is non-zero. Only two tweaks are needed.
//   in : v16 = encrypted initial tweak
//   out: v17 = copy of v16, v18 = v17 multiplied by x in GF(2^128)
//   clobbers: x9, v0, v1, v2
.only_2blks_tweak:
	mov	v17.16b,v16.16b
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
	mov	v2.16b,v17.16b
	// v0 = per-lane carry multipliers (0x87 for lane 0, 0x01 elsewhere)
	adrp	x9, .Lxts_magic
	ldr	q0, [x9, #:lo12:.Lxts_magic]
	shl	v18.16b, v2.16b, #1
	// rotate lanes up by one so each lane sees the previous lane, then keep
	// only the bit that lane shifted out and scale it by the multiplier
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v18.16b, v18.16b, v1.16b
	b	.check_dec


// .check_dec: pick the order in which the two tail tweaks are applied.
// The code that follows XORs v17 into the last whole block; v18 is presumably
// used for the merged (stolen) block, as in the _gb twin. For decryption the
// two tweaks must be exchanged.
.check_dec:
	// w28 holds the direction flag copied from w6 at entry.
	// w28 == 1 selects encryption and keeps v17/v18 as computed.
	// Any other value takes the swap path (0 is the documented decryption value).
	cmp	w28,1
	b.eq	.process_last_2blks
	// swap v17 <-> v18, using v0 as scratch
	mov	v0.16B,v17.16b
	mov	v17.16B,v18.16b
	mov	v18.16B,v0.16b

.process_last_2blks:
	// Process the last full 16-byte block:
	//   out[x1] = SM4(in[x0] ^ v17) ^ v17, then x0 and x1 advance by 16.
	// v18 is left untouched here and used for the stolen block after the store.
	// x3 is read as the table of 32-bit round keys (two per ldp).
	// v26-v31 are lookup constants expected to be loaded already
	// (presumably from .Lsbox_magic - TODO confirm against the prologue).
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
#ifdef __AARCH64EB__
	rev32	v18.16b,v18.16b
#endif
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v17.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	// x10 walks the round keys, w11 counts 8 iterations of 4 rounds each
	// (32 rounds), w12-w15 hold the four state words B0-B3.
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction:
	// byte shuffle (v26), nibble-table transform (v27/v28, mask v31),
	// AESE with an all-zero round key (v1), nibble-table transform (v29/v30)
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	// linear step: w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24)
	// (ror by 32-k is rol by k)
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	// fetch the next two round keys (w7 is free again at this point)
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// write the state back in reversed word order (B3,B2,B1,B0)
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	// second tweak XOR, then store and advance the output pointer
	eor	v4.16b, v4.16b, v17.16b
	st1	{v4.4s},[x1],#16

	// Ciphertext stealing for the trailing partial block.
	// x26 = address of the 16-byte block stored just above (x1 was
	// post-incremented by that store), x0 = remaining input bytes,
	// x1 = destination of the partial output block, x29 = byte count.
	sub	x26,x1,16
.loop:
	// For i = x29-1 down to 0: move byte i of the previous output block
	// to the partial output block and put input byte i in its place.
	// b.gt tests the flags set by subs; the byte loads and stores in
	// between do not modify them.
	subs	x29,x29,1
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	strb	w8,[x26,x29]
	strb	w7,[x1,x29]
	b.gt	.loop
	// Re-process the patched block in place at x26 with tweak v18:
	//   [x26] = SM4([x26] ^ v18) ^ v18
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v18.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	// x10 walks the round keys at x3, w11 counts 8 iterations of 4 rounds
	// each (32 rounds), w12-w15 hold the four state words B0-B3.
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction:
	// byte shuffle (v26), nibble-table transform (v27/v28, mask v31),
	// AESE with an all-zero round key (v1), nibble-table transform (v29/v30)
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	// linear step: w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24)
	// (ror by 32-k is rol by k)
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	// fetch the next two round keys (w7 is free again at this point)
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// write the state back in reversed word order (B3,B2,B1,B0)
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	// second tweak XOR, then overwrite the patched block in place
	eor	v4.16b, v4.16b, v18.16b
	st1	{v4.4s}, [x26]
.return:
	// Epilogue: pop twelve 16-byte pairs (192 bytes in total) and return.
	// The order presumably mirrors the pushes made in the prologue, which
	// is outside this view - TODO confirm.
	// Low 64 bits of v8-v15.
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	// x29 and x30; x29 was used as the trailing byte counter above.
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	// NOTE(review): this writes x18, the AAPCS64 platform register; any
	// change must be made together with the matching prologue store.
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt
