// Code generated by command: go run main.go -out ../../lsh512_amd64.s -stubs ../../lsh512_amd64_stubs.go -pkg lsh512. DO NOT EDIT.

//go:build amd64 && gc && !purego

#include "textflag.h"

// g_IV224 is a 128-byte read-only table of sixteen 64-bit words.
// lsh512InitSSE2 copies it into ctx+16 .. ctx+143 on its fall-through
// ("init224") path, i.e. when the 32-bit selector at ctx+0 is none of
// 0x40, 0x30 or 0x20.
DATA g_IV224<>+0(SB)/8, $0x0c401e9fe8813a55
DATA g_IV224<>+8(SB)/8, $0x4a5f446268fd3d35
DATA g_IV224<>+16(SB)/8, $0xff13e452334f612a
DATA g_IV224<>+24(SB)/8, $0xf8227661037e354a
DATA g_IV224<>+32(SB)/8, $0xa5f223723c9ca29d
DATA g_IV224<>+40(SB)/8, $0x95d965a11aed3979
DATA g_IV224<>+48(SB)/8, $0x01e23835b9ab02cc
DATA g_IV224<>+56(SB)/8, $0x52d49cbad5b30616
DATA g_IV224<>+64(SB)/8, $0x9e5c2027773f4ed3
DATA g_IV224<>+72(SB)/8, $0x66a5c8801925b701
DATA g_IV224<>+80(SB)/8, $0x22bbc85b4c6779d9
DATA g_IV224<>+88(SB)/8, $0xc13171a42c559c23
DATA g_IV224<>+96(SB)/8, $0x31e2b67d25be3813
DATA g_IV224<>+104(SB)/8, $0xd522c4deed8e4d83
DATA g_IV224<>+112(SB)/8, $0xa79f5509b43fbafe
DATA g_IV224<>+120(SB)/8, $0xe00d2cd88b4b6c6a
GLOBL g_IV224<>(SB), RODATA|NOPTR, $128

// g_IV256 is a 128-byte read-only table of sixteen 64-bit words.
// lsh512InitSSE2 copies it into ctx+16 .. ctx+143 when the 32-bit
// selector at ctx+0 equals 0x20 (the "init256" path).
DATA g_IV256<>+0(SB)/8, $0x6dc57c33df989423
DATA g_IV256<>+8(SB)/8, $0xd8ea7f6e8342c199
DATA g_IV256<>+16(SB)/8, $0x76df8356f8603ac4
DATA g_IV256<>+24(SB)/8, $0x40f1b44de838223a
DATA g_IV256<>+32(SB)/8, $0x39ffe7cfc31484cd
DATA g_IV256<>+40(SB)/8, $0x39c4326cc5281548
DATA g_IV256<>+48(SB)/8, $0x8a2ff85a346045d8
DATA g_IV256<>+56(SB)/8, $0xff202aa46dbdd61e
DATA g_IV256<>+64(SB)/8, $0xcf785b3cd5fcdb8b
DATA g_IV256<>+72(SB)/8, $0x1f0323b64a8150bf
DATA g_IV256<>+80(SB)/8, $0xff75d972f29ea355
DATA g_IV256<>+88(SB)/8, $0x2e567f30bf1ca9e1
DATA g_IV256<>+96(SB)/8, $0xb596875bf8ff6dba
DATA g_IV256<>+104(SB)/8, $0xfcca39b089ef4615
DATA g_IV256<>+112(SB)/8, $0xecff4017d020b4b6
DATA g_IV256<>+120(SB)/8, $0x7e77384c772ed802
GLOBL g_IV256<>(SB), RODATA|NOPTR, $128

// g_IV384 is a 128-byte read-only table of sixteen 64-bit words.
// lsh512InitSSE2 copies it into ctx+16 .. ctx+143 when the 32-bit
// selector at ctx+0 equals 0x30 (the "init384" path).
DATA g_IV384<>+0(SB)/8, $0x53156a66292808f6
DATA g_IV384<>+8(SB)/8, $0xb2c4f362b204c2bc
DATA g_IV384<>+16(SB)/8, $0xb84b7213bfa05c4e
DATA g_IV384<>+24(SB)/8, $0x976ceb7c1b299f73
DATA g_IV384<>+32(SB)/8, $0xdf0cc63c0570ae97
DATA g_IV384<>+40(SB)/8, $0xda4441baa486ce3f
DATA g_IV384<>+48(SB)/8, $0x6559f5d9b5f2acc2
DATA g_IV384<>+56(SB)/8, $0x22dacf19b4b52a16
DATA g_IV384<>+64(SB)/8, $0xbbcdacefde80953a
DATA g_IV384<>+72(SB)/8, $0xc9891a2879725b3e
DATA g_IV384<>+80(SB)/8, $0x7c9fe6330237e440
DATA g_IV384<>+88(SB)/8, $0xa30ba550553f7431
DATA g_IV384<>+96(SB)/8, $0xbb08043fb34e3e30
DATA g_IV384<>+104(SB)/8, $0xa0dec48d54618ead
DATA g_IV384<>+112(SB)/8, $0x150317267464bc57
DATA g_IV384<>+120(SB)/8, $0x32d1501fde63dc93
GLOBL g_IV384<>(SB), RODATA|NOPTR, $128

// g_IV512 is a 128-byte read-only table of sixteen 64-bit words.
// lsh512InitSSE2 copies it into ctx+16 .. ctx+143 when the 32-bit
// selector at ctx+0 equals 0x40 (the "init512" path).
DATA g_IV512<>+0(SB)/8, $0xadd50f3c7f07094e
DATA g_IV512<>+8(SB)/8, $0xe3f3cee8f9418a4f
DATA g_IV512<>+16(SB)/8, $0xb527ecde5b3d0ae9
DATA g_IV512<>+24(SB)/8, $0x2ef6dec68076f501
DATA g_IV512<>+32(SB)/8, $0x8cb994cae5aca216
DATA g_IV512<>+40(SB)/8, $0xfbb9eae4bba48cc7
DATA g_IV512<>+48(SB)/8, $0x650a526174725fea
DATA g_IV512<>+56(SB)/8, $0x1f9a61a73f8d8085
DATA g_IV512<>+64(SB)/8, $0xb6607378173b539b
DATA g_IV512<>+72(SB)/8, $0x1bc99853b0c0b9ed
DATA g_IV512<>+80(SB)/8, $0xdf727fc19b182d47
DATA g_IV512<>+88(SB)/8, $0xdbef360cf893a457
DATA g_IV512<>+96(SB)/8, $0x4981f5e570147e80
DATA g_IV512<>+104(SB)/8, $0xd00c4490ca7d3e30
DATA g_IV512<>+112(SB)/8, $0x5d73940c0e4ae1ec
DATA g_IV512<>+120(SB)/8, $0x894085e2edb2d819
GLOBL g_IV512<>(SB), RODATA|NOPTR, $128

// g_StepConstants is a 1792-byte read-only table of 224 64-bit words.
// The compress code in lsh512UpdateSSE2 consumes it in consecutive 64-byte
// groups, one group per mixing step (offsets +0, +64, +128, ...): each group
// is fetched with four aligned 16-byte loads ("load_sc") and XORed into the
// first four state vectors right after the alpha rotation.
DATA g_StepConstants<>+0(SB)/8, $0x97884283c938982a
DATA g_StepConstants<>+8(SB)/8, $0xba1fca93533e2355
DATA g_StepConstants<>+16(SB)/8, $0xc519a2e87aeb1c03
DATA g_StepConstants<>+24(SB)/8, $0x9a0fc95462af17b1
DATA g_StepConstants<>+32(SB)/8, $0xfc3dda8ab019a82b
DATA g_StepConstants<>+40(SB)/8, $0x02825d079a895407
DATA g_StepConstants<>+48(SB)/8, $0x79f2d0a7ee06a6f7
DATA g_StepConstants<>+56(SB)/8, $0xd76d15eed9fdf5fe
DATA g_StepConstants<>+64(SB)/8, $0x1fcac64d01d0c2c1
DATA g_StepConstants<>+72(SB)/8, $0xd9ea5de69161790f
DATA g_StepConstants<>+80(SB)/8, $0xdebc8b6366071fc8
DATA g_StepConstants<>+88(SB)/8, $0xa9d91db711c6c94b
DATA g_StepConstants<>+96(SB)/8, $0x3a18653ac9c1d427
DATA g_StepConstants<>+104(SB)/8, $0x84df64a223dd5b09
DATA g_StepConstants<>+112(SB)/8, $0x6cc37895f4ad9e70
DATA g_StepConstants<>+120(SB)/8, $0x448304c8d7f3f4d5
DATA g_StepConstants<>+128(SB)/8, $0xea91134ed29383e0
DATA g_StepConstants<>+136(SB)/8, $0xc4484477f2da88e8
DATA g_StepConstants<>+144(SB)/8, $0x9b47eec96d26e8a6
DATA g_StepConstants<>+152(SB)/8, $0x82f6d4c8d89014f4
DATA g_StepConstants<>+160(SB)/8, $0x527da0048b95fb61
DATA g_StepConstants<>+168(SB)/8, $0x644406c60138648d
DATA g_StepConstants<>+176(SB)/8, $0x303c0e8aa24c0edc
DATA g_StepConstants<>+184(SB)/8, $0xc787cda0cbe8ca19
DATA g_StepConstants<>+192(SB)/8, $0x7ba46221661764ca
DATA g_StepConstants<>+200(SB)/8, $0x0c8cbc6acd6371ac
DATA g_StepConstants<>+208(SB)/8, $0xe336b836940f8f41
DATA g_StepConstants<>+216(SB)/8, $0x79cb9da168a50976
DATA g_StepConstants<>+224(SB)/8, $0xd01da49021915cb3
DATA g_StepConstants<>+232(SB)/8, $0xa84accc7399cf1f1
DATA g_StepConstants<>+240(SB)/8, $0x6c4a992cee5aeb0c
DATA g_StepConstants<>+248(SB)/8, $0x4f556e6cb4b2e3e0
DATA g_StepConstants<>+256(SB)/8, $0x200683877d7c2f45
DATA g_StepConstants<>+264(SB)/8, $0x9949273830d51db8
DATA g_StepConstants<>+272(SB)/8, $0x19eeeecaa39ed124
DATA g_StepConstants<>+280(SB)/8, $0x45693f0a0dae7fef
DATA g_StepConstants<>+288(SB)/8, $0xedc234b1b2ee1083
DATA g_StepConstants<>+296(SB)/8, $0xf3179400d68ee399
DATA g_StepConstants<>+304(SB)/8, $0xb6e3c61b4945f778
DATA g_StepConstants<>+312(SB)/8, $0xa4c3db216796c42f
DATA g_StepConstants<>+320(SB)/8, $0x268a0b04f9ab7465
DATA g_StepConstants<>+328(SB)/8, $0xe2705f6905f2d651
DATA g_StepConstants<>+336(SB)/8, $0x08ddb96e426ff53d
DATA g_StepConstants<>+344(SB)/8, $0xaea84917bc2e6f34
DATA g_StepConstants<>+352(SB)/8, $0xaff6e664a0fe9470
DATA g_StepConstants<>+360(SB)/8, $0x0aab94d765727d8c
DATA g_StepConstants<>+368(SB)/8, $0x9aa9e1648f3d702e
DATA g_StepConstants<>+376(SB)/8, $0x689efc88fe5af3d3
DATA g_StepConstants<>+384(SB)/8, $0xb0950ffea51fd98b
DATA g_StepConstants<>+392(SB)/8, $0x52cfc86ef8c92833
DATA g_StepConstants<>+400(SB)/8, $0xe69727b0b2653245
DATA g_StepConstants<>+408(SB)/8, $0x56f160d3ea9da3e2
DATA g_StepConstants<>+416(SB)/8, $0xa6dd4b059f93051f
DATA g_StepConstants<>+424(SB)/8, $0xb6406c3cd7f00996
DATA g_StepConstants<>+432(SB)/8, $0x448b45f3ccad9ec8
DATA g_StepConstants<>+440(SB)/8, $0x079b8587594ec73b
DATA g_StepConstants<>+448(SB)/8, $0x45a50ea3c4f9653b
DATA g_StepConstants<>+456(SB)/8, $0x22983767c1f15b85
DATA g_StepConstants<>+464(SB)/8, $0x7dbed8631797782b
DATA g_StepConstants<>+472(SB)/8, $0x485234be88418638
DATA g_StepConstants<>+480(SB)/8, $0x842850a5329824c5
DATA g_StepConstants<>+488(SB)/8, $0xf6aca914c7f9a04c
DATA g_StepConstants<>+496(SB)/8, $0xcfd139c07a4c670c
DATA g_StepConstants<>+504(SB)/8, $0xa3210ce0a8160242
DATA g_StepConstants<>+512(SB)/8, $0xeab3b268be5ea080
DATA g_StepConstants<>+520(SB)/8, $0xbacf9f29b34ce0a7
DATA g_StepConstants<>+528(SB)/8, $0x3c973b7aaf0fa3a8
DATA g_StepConstants<>+536(SB)/8, $0x9a86f346c9c7be80
DATA g_StepConstants<>+544(SB)/8, $0xac78f5d7cabcea49
DATA g_StepConstants<>+552(SB)/8, $0xa355bddcc199ed42
DATA g_StepConstants<>+560(SB)/8, $0xa10afa3ac6b373db
DATA g_StepConstants<>+568(SB)/8, $0xc42ded88be1844e5
DATA g_StepConstants<>+576(SB)/8, $0x9e661b271cff216a
DATA g_StepConstants<>+584(SB)/8, $0x8a6ec8dd002d8861
DATA g_StepConstants<>+592(SB)/8, $0xd3d2b629beb34be4
DATA g_StepConstants<>+600(SB)/8, $0x217a3a1091863f1a
DATA g_StepConstants<>+608(SB)/8, $0x256ecda287a733f5
DATA g_StepConstants<>+616(SB)/8, $0xf9139a9e5b872fe5
DATA g_StepConstants<>+624(SB)/8, $0xac0535017a274f7c
DATA g_StepConstants<>+632(SB)/8, $0xf21b7646d65d2aa9
DATA g_StepConstants<>+640(SB)/8, $0x048142441c208c08
DATA g_StepConstants<>+648(SB)/8, $0xf937a5dd2db5e9eb
DATA g_StepConstants<>+656(SB)/8, $0xa688dfe871ff30b7
DATA g_StepConstants<>+664(SB)/8, $0x9bb44aa217c5593b
DATA g_StepConstants<>+672(SB)/8, $0x943c702a2edb291a
DATA g_StepConstants<>+680(SB)/8, $0x0cae38f9e2b715de
DATA g_StepConstants<>+688(SB)/8, $0xb13a367ba176cc28
DATA g_StepConstants<>+696(SB)/8, $0x0d91bd1d3387d49b
DATA g_StepConstants<>+704(SB)/8, $0x85c386603cac940c
DATA g_StepConstants<>+712(SB)/8, $0x30dd830ae39fd5e4
DATA g_StepConstants<>+720(SB)/8, $0x2f68c85a712fe85d
DATA g_StepConstants<>+728(SB)/8, $0x4ffeecb9dd1e94d6
DATA g_StepConstants<>+736(SB)/8, $0xd0ac9a590a0443ae
DATA g_StepConstants<>+744(SB)/8, $0xbae732dc99ccf3ea
DATA g_StepConstants<>+752(SB)/8, $0xeb70b21d1842f4d9
DATA g_StepConstants<>+760(SB)/8, $0x9f4eda50bb5c6fa8
DATA g_StepConstants<>+768(SB)/8, $0x4949e69ce940a091
DATA g_StepConstants<>+776(SB)/8, $0x0e608dee8375ba14
DATA g_StepConstants<>+784(SB)/8, $0x983122cba118458c
DATA g_StepConstants<>+792(SB)/8, $0x4eeba696fbb36b25
DATA g_StepConstants<>+800(SB)/8, $0x7d46f3630e47f27e
DATA g_StepConstants<>+808(SB)/8, $0xa21a0f7666c0dea4
DATA g_StepConstants<>+816(SB)/8, $0x5c22cf355b37cec4
DATA g_StepConstants<>+824(SB)/8, $0xee292b0c17cc1847
DATA g_StepConstants<>+832(SB)/8, $0x9330838629e131da
DATA g_StepConstants<>+840(SB)/8, $0x6eee7c71f92fce22
DATA g_StepConstants<>+848(SB)/8, $0xc953ee6cb95dd224
DATA g_StepConstants<>+856(SB)/8, $0x3a923d92af1e9073
DATA g_StepConstants<>+864(SB)/8, $0xc43a5671563a70fb
DATA g_StepConstants<>+872(SB)/8, $0xbc2985dd279f8346
DATA g_StepConstants<>+880(SB)/8, $0x7ef2049093069320
DATA g_StepConstants<>+888(SB)/8, $0x17543723e3e46035
DATA g_StepConstants<>+896(SB)/8, $0xc3b409b00b130c6d
DATA g_StepConstants<>+904(SB)/8, $0x5d6aee6b28fdf090
DATA g_StepConstants<>+912(SB)/8, $0x1d425b26172ff6ed
DATA g_StepConstants<>+920(SB)/8, $0xcccfd041cdaf03ad
DATA g_StepConstants<>+928(SB)/8, $0xfe90c7c790ab6cbf
DATA g_StepConstants<>+936(SB)/8, $0xe5af6304c722ca02
DATA g_StepConstants<>+944(SB)/8, $0x70f695239999b39e
DATA g_StepConstants<>+952(SB)/8, $0x6b8b5b07c844954c
DATA g_StepConstants<>+960(SB)/8, $0x77bdb9bb1e1f7a30
DATA g_StepConstants<>+968(SB)/8, $0xc859599426ee80ed
DATA g_StepConstants<>+976(SB)/8, $0x5f9d813d4726e40a
DATA g_StepConstants<>+984(SB)/8, $0x9ca0120f7cb2b179
DATA g_StepConstants<>+992(SB)/8, $0x8f588f583c182cbd
DATA g_StepConstants<>+1000(SB)/8, $0x951267cbe9eccce7
DATA g_StepConstants<>+1008(SB)/8, $0x678bb8bd334d520e
DATA g_StepConstants<>+1016(SB)/8, $0xf6e662d00cd9e1b7
DATA g_StepConstants<>+1024(SB)/8, $0x357774d93d99aaa7
DATA g_StepConstants<>+1032(SB)/8, $0x21b2edbb156f6eb5
DATA g_StepConstants<>+1040(SB)/8, $0xfd1ebe846e0aee69
DATA g_StepConstants<>+1048(SB)/8, $0x3cb2218c2f642b15
DATA g_StepConstants<>+1056(SB)/8, $0xe7e7e7945444ea4c
DATA g_StepConstants<>+1064(SB)/8, $0xa77a33b5d6b9b47c
DATA g_StepConstants<>+1072(SB)/8, $0xf34475f0809f6075
DATA g_StepConstants<>+1080(SB)/8, $0xdd4932dce6bb99ad
DATA g_StepConstants<>+1088(SB)/8, $0xacec4e16d74451dc
DATA g_StepConstants<>+1096(SB)/8, $0xd4a0a8d084de23d6
DATA g_StepConstants<>+1104(SB)/8, $0x1bdd42f278f95866
DATA g_StepConstants<>+1112(SB)/8, $0xeed3adbb938f4051
DATA g_StepConstants<>+1120(SB)/8, $0xcfcf7be8992f3733
DATA g_StepConstants<>+1128(SB)/8, $0x21ade98c906e3123
DATA g_StepConstants<>+1136(SB)/8, $0x37ba66711fffd668
DATA g_StepConstants<>+1144(SB)/8, $0x267c0fc3a255478a
DATA g_StepConstants<>+1152(SB)/8, $0x993a64ee1b962e88
DATA g_StepConstants<>+1160(SB)/8, $0x754979556301faaa
DATA g_StepConstants<>+1168(SB)/8, $0xf920356b7251be81
DATA g_StepConstants<>+1176(SB)/8, $0xc281694f22cf923f
DATA g_StepConstants<>+1184(SB)/8, $0x9f4b6481c8666b02
DATA g_StepConstants<>+1192(SB)/8, $0xcf97761cfe9f5444
DATA g_StepConstants<>+1200(SB)/8, $0xf220d7911fd63e9f
DATA g_StepConstants<>+1208(SB)/8, $0xa28bd365f79cd1b0
DATA g_StepConstants<>+1216(SB)/8, $0xd39f5309b1c4b721
DATA g_StepConstants<>+1224(SB)/8, $0xbec2ceb864fca51f
DATA g_StepConstants<>+1232(SB)/8, $0x1955a0ddc410407a
DATA g_StepConstants<>+1240(SB)/8, $0x43eab871f261d201
DATA g_StepConstants<>+1248(SB)/8, $0xeaafe64a2ed16da1
DATA g_StepConstants<>+1256(SB)/8, $0x670d931b9df39913
DATA g_StepConstants<>+1264(SB)/8, $0x12f868b0f614de91
DATA g_StepConstants<>+1272(SB)/8, $0x2e5f395d946e8252
DATA g_StepConstants<>+1280(SB)/8, $0x72f25cbb767bd8f4
DATA g_StepConstants<>+1288(SB)/8, $0x8191871d61a1c4dd
DATA g_StepConstants<>+1296(SB)/8, $0x6ef67ea1d450ba93
DATA g_StepConstants<>+1304(SB)/8, $0x2ea32a645433d344
DATA g_StepConstants<>+1312(SB)/8, $0x9a963079003f0f8b
DATA g_StepConstants<>+1320(SB)/8, $0x74a0aeb9918cac7a
DATA g_StepConstants<>+1328(SB)/8, $0x0b6119a70af36fa3
DATA g_StepConstants<>+1336(SB)/8, $0x8d9896f202f0d480
DATA g_StepConstants<>+1344(SB)/8, $0x654f1831f254cd66
DATA g_StepConstants<>+1352(SB)/8, $0x1318a47f0366a25e
DATA g_StepConstants<>+1360(SB)/8, $0x65752076250b4e01
DATA g_StepConstants<>+1368(SB)/8, $0xd1cd8eb888071772
DATA g_StepConstants<>+1376(SB)/8, $0x30c6a9793f4e9b25
DATA g_StepConstants<>+1384(SB)/8, $0x154f684b1e3926ee
DATA g_StepConstants<>+1392(SB)/8, $0x6c7ac0b1fe6312ae
DATA g_StepConstants<>+1400(SB)/8, $0x262f88f4f3c5550d
DATA g_StepConstants<>+1408(SB)/8, $0xb4674a24472233cb
DATA g_StepConstants<>+1416(SB)/8, $0x2bbd23826a090071
DATA g_StepConstants<>+1424(SB)/8, $0xda95969b30594f66
DATA g_StepConstants<>+1432(SB)/8, $0x9f5c47408f1e8a43
DATA g_StepConstants<>+1440(SB)/8, $0xf77022b88de9c055
DATA g_StepConstants<>+1448(SB)/8, $0x64b7b36957601503
DATA g_StepConstants<>+1456(SB)/8, $0xe73b72b06175c11a
DATA g_StepConstants<>+1464(SB)/8, $0x55b87de8b91a6233
DATA g_StepConstants<>+1472(SB)/8, $0x1bb16e6b6955ff7f
DATA g_StepConstants<>+1480(SB)/8, $0xe8e0a5ec7309719c
DATA g_StepConstants<>+1488(SB)/8, $0x702c31cb89a8b640
DATA g_StepConstants<>+1496(SB)/8, $0xfba387cfada8cde2
DATA g_StepConstants<>+1504(SB)/8, $0x6792db4677aa164c
DATA g_StepConstants<>+1512(SB)/8, $0x1c6b1cc0b7751867
DATA g_StepConstants<>+1520(SB)/8, $0x22ae2311d736dc01
DATA g_StepConstants<>+1528(SB)/8, $0x0e3666a1d37c9588
DATA g_StepConstants<>+1536(SB)/8, $0xcd1fd9d4bf557e9a
DATA g_StepConstants<>+1544(SB)/8, $0xc986925f7c7b0e84
DATA g_StepConstants<>+1552(SB)/8, $0x9c5dfd55325ef6b0
DATA g_StepConstants<>+1560(SB)/8, $0x9f2b577d5676b0dd
DATA g_StepConstants<>+1568(SB)/8, $0xfa6e21be21c062b3
DATA g_StepConstants<>+1576(SB)/8, $0x8787dd782c8d7f83
DATA g_StepConstants<>+1584(SB)/8, $0xd0d134e90e12dd23
DATA g_StepConstants<>+1592(SB)/8, $0x449d087550121d96
DATA g_StepConstants<>+1600(SB)/8, $0xecf9ae9414d41967
DATA g_StepConstants<>+1608(SB)/8, $0x5018f1dbf789934d
DATA g_StepConstants<>+1616(SB)/8, $0xfa5b52879155a74c
DATA g_StepConstants<>+1624(SB)/8, $0xca82d4d3cd278e7c
DATA g_StepConstants<>+1632(SB)/8, $0x688fdfdfe22316ad
DATA g_StepConstants<>+1640(SB)/8, $0x0f6555a4ba0d030a
DATA g_StepConstants<>+1648(SB)/8, $0xa2061df720f000f3
DATA g_StepConstants<>+1656(SB)/8, $0xe1a57dc5622fb3da
DATA g_StepConstants<>+1664(SB)/8, $0xe6a842a8e8ed8153
DATA g_StepConstants<>+1672(SB)/8, $0x690acdd3811ce09d
DATA g_StepConstants<>+1680(SB)/8, $0x55adda18e6fcf446
DATA g_StepConstants<>+1688(SB)/8, $0x4d57a8a0f4b60b46
DATA g_StepConstants<>+1696(SB)/8, $0xf86fbfc20539c415
DATA g_StepConstants<>+1704(SB)/8, $0x74bafa5ec7100d19
DATA g_StepConstants<>+1712(SB)/8, $0xa824151810f0f495
DATA g_StepConstants<>+1720(SB)/8, $0x8723432791e38ebb
DATA g_StepConstants<>+1728(SB)/8, $0x8eeaeb91d66ed539
DATA g_StepConstants<>+1736(SB)/8, $0x73d8a1549dfd7e06
DATA g_StepConstants<>+1744(SB)/8, $0x0387f2ffe3f13a9b
DATA g_StepConstants<>+1752(SB)/8, $0xa5004995aac15193
DATA g_StepConstants<>+1760(SB)/8, $0x682f81c73efdda0d
DATA g_StepConstants<>+1768(SB)/8, $0x2fb55925d71d268d
DATA g_StepConstants<>+1776(SB)/8, $0xcc392d2901e58a3d
DATA g_StepConstants<>+1784(SB)/8, $0xaa666ab975724a42
GLOBL g_StepConstants<>(SB), RODATA|NOPTR, $1792

// g_BytePermInfo_avx2 is a 64-byte read-only table of byte indices, laid out
// as two 32-byte rows. It is not referenced by the SSE2 code in the visible
// part of this file.
// NOTE(review): presumably a byte-shuffle control for an AVX2 variant of the
// gamma rotation - confirm against the AVX2 routines.
DATA g_BytePermInfo_avx2<>+0(SB)/8, $0x0706050403020100
DATA g_BytePermInfo_avx2<>+8(SB)/8, $0x0d0c0b0a09080f0e
DATA g_BytePermInfo_avx2<>+16(SB)/8, $0x1312111017161514
DATA g_BytePermInfo_avx2<>+24(SB)/8, $0x19181f1e1d1c1b1a
DATA g_BytePermInfo_avx2<>+32(SB)/8, $0x0605040302010007
DATA g_BytePermInfo_avx2<>+40(SB)/8, $0x0c0b0a09080f0e0d
DATA g_BytePermInfo_avx2<>+48(SB)/8, $0x1211101716151413
DATA g_BytePermInfo_avx2<>+56(SB)/8, $0x181f1e1d1c1b1a19
GLOBL g_BytePermInfo_avx2<>(SB), RODATA|NOPTR, $64

// g_MsgWordPermInfo_avx2 is a 32-byte read-only table holding the byte values
// 0x00 .. 0x1f in ascending memory order. It is not referenced by the SSE2
// code in the visible part of this file.
// NOTE(review): presumably consumed by an AVX2 message-expansion routine -
// confirm against the AVX2 routines.
DATA g_MsgWordPermInfo_avx2<>+0(SB)/8, $0x0706050403020100
DATA g_MsgWordPermInfo_avx2<>+8(SB)/8, $0x0f0e0d0c0b0a0908
DATA g_MsgWordPermInfo_avx2<>+16(SB)/8, $0x1716151413121110
DATA g_MsgWordPermInfo_avx2<>+24(SB)/8, $0x1f1e1d1c1b1a1918
GLOBL g_MsgWordPermInfo_avx2<>(SB), RODATA|NOPTR, $32

// g_BytePermInfo_sse2 holds two 16-byte lane masks (32 bytes total):
//   +0  : low 64 bits clear, high 64 bits set  (selects the high lane)
//   +16 : low 64 bits set,  high 64 bits clear (selects the low lane)
// The "rotate_msg_gamma" sequences in lsh512UpdateSSE2 PAND a vector with
// both masks to split it into lanes, rotate only the high lane left by
// 16 bits (PSLLQ 16 / PSRLQ 48), and XOR the halves back together.
DATA g_BytePermInfo_sse2<>+0(SB)/4, $0x00000000
DATA g_BytePermInfo_sse2<>+4(SB)/4, $0x00000000
DATA g_BytePermInfo_sse2<>+8(SB)/4, $0xffffffff
DATA g_BytePermInfo_sse2<>+12(SB)/4, $0xffffffff
DATA g_BytePermInfo_sse2<>+16(SB)/4, $0xffffffff
DATA g_BytePermInfo_sse2<>+20(SB)/4, $0xffffffff
DATA g_BytePermInfo_sse2<>+24(SB)/4, $0x00000000
DATA g_BytePermInfo_sse2<>+28(SB)/4, $0x00000000
GLOBL g_BytePermInfo_sse2<>(SB), RODATA|NOPTR, $32

// g_BytePermInfo_ssse3 is a 64-byte read-only table of byte indices in the
// range 0x00 .. 0x0f, laid out as four 16-byte rows. It is not referenced by
// the SSE2 code in the visible part of this file.
// NOTE(review): presumably byte-shuffle controls for an SSSE3 variant of the
// gamma rotation - confirm against the SSSE3 routines.
DATA g_BytePermInfo_ssse3<>+0(SB)/8, $0x0706050403020100
DATA g_BytePermInfo_ssse3<>+8(SB)/8, $0x0d0c0b0a09080f0e
DATA g_BytePermInfo_ssse3<>+16(SB)/8, $0x0302010007060504
DATA g_BytePermInfo_ssse3<>+24(SB)/8, $0x09080f0e0d0c0b0a
DATA g_BytePermInfo_ssse3<>+32(SB)/8, $0x0605040302010007
DATA g_BytePermInfo_ssse3<>+40(SB)/8, $0x0c0b0a09080f0e0d
DATA g_BytePermInfo_ssse3<>+48(SB)/8, $0x0201000706050403
DATA g_BytePermInfo_ssse3<>+56(SB)/8, $0x080f0e0d0c0b0a09
GLOBL g_BytePermInfo_ssse3<>(SB), RODATA|NOPTR, $64

// func lsh512InitSSE2(ctx *lsh512ContextAsmData)
// Requires: SSE2
//
// Seeds the 128-byte region ctx+16 .. ctx+143 from one of the g_IV* tables.
// The table is picked by the 32-bit selector stored at ctx+0:
//   0x40 -> g_IV512
//   0x30 -> g_IV384
//   0x20 -> g_IV256
//   any other value -> g_IV224
// The 64-bit field at ctx+8 is read on entry and written back unchanged.
//
// Register roles:
//   AX    = ctx
//   CX    = selector (32-bit value at ctx+0)
//   DX    = value of the 64-bit field at ctx+8
//   SI    = address of the chosen IV table
//   X0-X7 = the 128 table bytes while they are being copied
TEXT ·lsh512InitSSE2(SB), NOSPLIT, $0-8
	MOVQ ctx+0(FP), AX
	MOVL (AX), CX
	MOVQ 8(AX), DX

	// Choose the IV table. Each candidate address is placed in SI before
	// its selector test; the last LEAQ is the fall-through default.
	LEAQ g_IV512<>(SB), SI
	CMPL CX, $0x00000040
	JE   lsh512_sse2_init_copy
	LEAQ g_IV384<>(SB), SI
	CMPL CX, $0x00000030
	JE   lsh512_sse2_init_copy
	LEAQ g_IV256<>(SB), SI
	CMPL CX, $0x00000020
	JE   lsh512_sse2_init_copy
	LEAQ g_IV224<>(SB), SI

lsh512_sse2_init_copy:
	// Pull the whole table into registers with aligned loads ...
	MOVOA (SI), X0
	MOVOA 16(SI), X1
	MOVOA 32(SI), X2
	MOVOA 48(SI), X3
	MOVOA 64(SI), X4
	MOVOA 80(SI), X5
	MOVOA 96(SI), X6
	MOVOA 112(SI), X7

	// ... then write it to the context with unaligned stores, since no
	// alignment of ctx is assumed here.
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// AX still holds ctx; store back the ctx+8 value loaded on entry.
	MOVQ DX, 8(AX)
	RET

// func lsh512UpdateSSE2(ctx *lsh512ContextAsmData, data []byte)
// Requires: SSE2
TEXT ·lsh512UpdateSSE2(SB), NOSPLIT, $256-32
	MOVQ ctx+0(FP), AX
	MOVL (AX), CX
	MOVQ 8(AX), CX
	MOVQ data_base+8(FP), DX
	MOVQ data_len+16(FP), BX

	// lsh512_sse2_update
	MOVQ CX, SI
	MOVQ BX, DI
	ADDQ SI, DI
	CMPQ DI, $0x00000100
	JGE  lsh512_sse2_update_if0_end

	// Memcpy
	LEAQ 144(AX)(SI*1), AX
	LEAQ (DX), DI
	MOVQ BX, R8

memcpy_1_sz16_start:
	CMPQ  R8, $0x00000010
	JL    memcpy_1_sz16_end
	MOVOU (DI), X0
	MOVOU X0, (AX)
	ADDQ  $0x00000010, DI
	ADDQ  $0x00000010, AX
	SUBQ  $0x00000010, R8
	JMP   memcpy_1_sz16_start

memcpy_1_sz16_end:
memcpy_1_sz8_start:
	CMPQ R8, $0x00000008
	JL   memcpy_1_sz8_end
	MOVQ (DI), DX
	MOVQ DX, (AX)
	ADDQ $0x00000008, DI
	ADDQ $0x00000008, AX
	SUBQ $0x00000008, R8
	JMP  memcpy_1_sz8_start

memcpy_1_sz8_end:
memcpy_1_sz4_start:
	CMPQ R8, $0x00000004
	JL   memcpy_1_sz4_end
	MOVL (DI), DX
	MOVL DX, (AX)
	ADDQ $0x00000004, DI
	ADDQ $0x00000004, AX
	SUBQ $0x00000004, R8
	JMP  memcpy_1_sz4_start

memcpy_1_sz4_end:
memcpy_1_sz2_start:
	CMPQ R8, $0x00000002
	JL   memcpy_1_sz2_end
	MOVW (DI), DX
	MOVW DX, (AX)
	ADDQ $0x00000002, DI
	ADDQ $0x00000002, AX
	SUBQ $0x00000002, R8
	JMP  memcpy_1_sz2_start

memcpy_1_sz2_end:
memcpy_1_sz1_start:
	CMPQ R8, $0x00000001
	JL   memcpy_1_sz1_end
	MOVB (DI), DL
	MOVB DL, (AX)
	ADDQ $0x00000001, DI
	ADDQ $0x00000001, AX
	SUBQ $0x00000001, R8
	JMP  memcpy_1_sz1_start

memcpy_1_sz1_end:
	ADDQ BX, CX
	ADDQ BX, SI
	JMP  lsh512_sse2_update_ret

lsh512_sse2_update_if0_end:
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7
	CMPQ  SI, $0x00000000
	JE    lsh512_sse2_update_if1_end
	MOVQ  $0x00000100, CX
	SUBQ  SI, CX

	// Memcpy
	LEAQ 144(AX)(SI*1), DI
	LEAQ (DX), R8
	MOVQ CX, R9

memcpy_2_sz16_start:
	CMPQ  R9, $0x00000010
	JL    memcpy_2_sz16_end
	MOVOU (R8), X8
	MOVOU X8, (DI)
	ADDQ  $0x00000010, R8
	ADDQ  $0x00000010, DI
	SUBQ  $0x00000010, R9
	JMP   memcpy_2_sz16_start

memcpy_2_sz16_end:
memcpy_2_sz8_start:
	CMPQ R9, $0x00000008
	JL   memcpy_2_sz8_end
	MOVQ (R8), SI
	MOVQ SI, (DI)
	ADDQ $0x00000008, R8
	ADDQ $0x00000008, DI
	SUBQ $0x00000008, R9
	JMP  memcpy_2_sz8_start

memcpy_2_sz8_end:
memcpy_2_sz4_start:
	CMPQ R9, $0x00000004
	JL   memcpy_2_sz4_end
	MOVL (R8), SI
	MOVL SI, (DI)
	ADDQ $0x00000004, R8
	ADDQ $0x00000004, DI
	SUBQ $0x00000004, R9
	JMP  memcpy_2_sz4_start

memcpy_2_sz4_end:
memcpy_2_sz2_start:
	CMPQ R9, $0x00000002
	JL   memcpy_2_sz2_end
	MOVW (R8), SI
	MOVW SI, (DI)
	ADDQ $0x00000002, R8
	ADDQ $0x00000002, DI
	SUBQ $0x00000002, R9
	JMP  memcpy_2_sz2_start

memcpy_2_sz2_end:
memcpy_2_sz1_start:
	CMPQ R9, $0x00000001
	JL   memcpy_2_sz1_end
	MOVB (R8), SI
	MOVB SI, (DI)
	ADDQ $0x00000001, R8
	ADDQ $0x00000001, DI
	SUBQ $0x00000001, R9
	JMP  memcpy_2_sz1_start

memcpy_2_sz1_end:
	// compress
	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 144(AX), X8
	MOVOU X8, (SP)
	MOVOU 160(AX), X8
	MOVOU X8, 16(SP)
	MOVOU 176(AX), X8
	MOVOU X8, 32(SP)
	MOVOU 192(AX), X8
	MOVOU X8, 48(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 208(AX), X8
	MOVOU X8, 64(SP)
	MOVOU 224(AX), X8
	MOVOU X8, 80(SP)
	MOVOU 240(AX), X8
	MOVOU X8, 96(SP)
	MOVOU 256(AX), X8
	MOVOU X8, 112(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 272(AX), X8
	MOVOU X8, 128(SP)
	MOVOU 288(AX), X8
	MOVOU X8, 144(SP)
	MOVOU 304(AX), X8
	MOVOU X8, 160(SP)
	MOVOU 320(AX), X8
	MOVOU X8, 176(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 336(AX), X8
	MOVOU X8, 192(SP)
	MOVOU 352(AX), X8
	MOVOU X8, 208(SP)
	MOVOU 368(AX), X8
	MOVOU X8, 224(SP)
	MOVOU 384(AX), X8
	MOVOU X8, 240(SP)

	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+0(SB), X8
	MOVOA g_StepConstants<>+16(SB), X9
	MOVOA g_StepConstants<>+32(SB), X10
	MOVOA g_StepConstants<>+48(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+64(SB), X8
	MOVOA g_StepConstants<>+80(SB), X9
	MOVOA g_StepConstants<>+96(SB), X10
	MOVOA g_StepConstants<>+112(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+128(SB), X8
	MOVOA g_StepConstants<>+144(SB), X9
	MOVOA g_StepConstants<>+160(SB), X10
	MOVOA g_StepConstants<>+176(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+192(SB), X8
	MOVOA g_StepConstants<>+208(SB), X9
	MOVOA g_StepConstants<>+224(SB), X10
	MOVOA g_StepConstants<>+240(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+256(SB), X8
	MOVOA g_StepConstants<>+272(SB), X9
	MOVOA g_StepConstants<>+288(SB), X10
	MOVOA g_StepConstants<>+304(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+320(SB), X8
	MOVOA g_StepConstants<>+336(SB), X9
	MOVOA g_StepConstants<>+352(SB), X10
	MOVOA g_StepConstants<>+368(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+384(SB), X8
	MOVOA g_StepConstants<>+400(SB), X9
	MOVOA g_StepConstants<>+416(SB), X10
	MOVOA g_StepConstants<>+432(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+448(SB), X8
	MOVOA g_StepConstants<>+464(SB), X9
	MOVOA g_StepConstants<>+480(SB), X10
	MOVOA g_StepConstants<>+496(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+512(SB), X8
	MOVOA g_StepConstants<>+528(SB), X9
	MOVOA g_StepConstants<>+544(SB), X10
	MOVOA g_StepConstants<>+560(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+576(SB), X8
	MOVOA g_StepConstants<>+592(SB), X9
	MOVOA g_StepConstants<>+608(SB), X10
	MOVOA g_StepConstants<>+624(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+640(SB), X8
	MOVOA g_StepConstants<>+656(SB), X9
	MOVOA g_StepConstants<>+672(SB), X10
	MOVOA g_StepConstants<>+688(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+704(SB), X8
	MOVOA g_StepConstants<>+720(SB), X9
	MOVOA g_StepConstants<>+736(SB), X10
	MOVOA g_StepConstants<>+752(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+768(SB), X8
	MOVOA g_StepConstants<>+784(SB), X9
	MOVOA g_StepConstants<>+800(SB), X10
	MOVOA g_StepConstants<>+816(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+832(SB), X8
	MOVOA g_StepConstants<>+848(SB), X9
	MOVOA g_StepConstants<>+864(SB), X10
	MOVOA g_StepConstants<>+880(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+896(SB), X8
	MOVOA g_StepConstants<>+912(SB), X9
	MOVOA g_StepConstants<>+928(SB), X10
	MOVOA g_StepConstants<>+944(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+960(SB), X8
	MOVOA g_StepConstants<>+976(SB), X9
	MOVOA g_StepConstants<>+992(SB), X10
	MOVOA g_StepConstants<>+1008(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// Even step, message-expansion phase: the 128-byte buffer at 0..127(SP) is
	// rebuilt in two 64-byte halves as permute(itself) + the buffer at
	// 128..255(SP), using 64-bit lane additions.
	// NOTE(review): the SP frame layout is set up outside this view - confirm.
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// Permute the 64-bit words of X0..X3 (X8/X9 are scratch). PSHUFD $0x4e
	// swaps the two 64-bit halves of a register. Lanes written [low, high]:
	//   X0 <- half-swapped X1,            X1 <- old X0
	//   X2 <- [old X3 high, old X2 low],  X3 <- [old X2 high, old X3 low]
	// Then add X4..X7 lane-wise (PADDQ, 64-bit lanes).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the first expanded half back over 0..63(SP).
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// Second half: same permute-and-add on 64..127(SP) with 192..255(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same word permutation as for the first half, followed by the lane-wise add.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the second expanded half back over 64..127(SP).
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// Reload the eight 16-byte state vectors stored at 16(AX)..143(AX) into
	// X0..X7 (unaligned loads).
	// NOTE(review): AX is assigned before this view; presumably it points at
	// the hash context - confirm.
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// XOR the freshly expanded buffer 0..127(SP) into the state X0..X7
	// (X8..X11 are temporaries).
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load the 64 bytes of g_StepConstants at byte offsets 1024..1087 into
	// X8..X11; they are consumed by the XOR at the end of the alpha rotation.
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1024(SB), X8
	MOVOA g_StepConstants<>+1040(SB), X9
	MOVOA g_StepConstants<>+1056(SB), X10
	MOVOA g_StepConstants<>+1072(SB), X11

	// mix_even
	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every 64-bit lane of X0..X3 left by 23 bits (shift left 0x17,
	// shift right 0x29, OR; X12 is scratch), then XOR in the step constants.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3 (64-bit lanes).
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every 64-bit lane of X4..X7 left by 59 bits (shift left 0x3b,
	// shift right 0x05, OR). X8 is reused as scratch; the constant it held
	// has already been consumed.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register rotation of X4..X7. X5, X6 and X7 are first rotated left
	// in every 64-bit lane by 32, 8 and 40 bits respectively (the two shifted
	// parts occupy disjoint bits, so PXOR acts as OR); X4 gets no such
	// whole-register rotation. Each register is then split with the two
	// 16-byte masks at g_BytePermInfo_sse2+0 and +16: the part selected by
	// the +0 mask is rotated left by a further 16 bits per lane and XORed
	// back onto the part selected by the +16 mask.
	// NOTE(review): the mask values are defined outside this view; presumably
	// they select complementary lanes so only one lane gets the extra 16-bit
	// rotation - confirm.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle 64-bit words across X0..X7 (X8/X9 scratch). Lanes written
	// [low, high], right-hand sides are the values before the shuffle:
	//   X0 <- [X1 low, X0 low],   X1 <- [X0 high, X1 high]
	//   X2 <- [X3 low, X2 low],   X3 <- [X2 high, X3 high]
	//   X4 <- [X4 low, X5 high],  X5 <- [X5 low, X4 high]
	//   X6 <- [X6 low, X7 high],  X7 <- [X7 low, X6 high]
	// The trailing MOVOA run then moves whole registers:
	//   X0<-X2, X1<-X3, X2<-X6, X3<-X7, X6<-X4, X7<-X5, X4<-X0, X5<-X1
	// (X0/X1 are parked in X8/X9 first so they are not lost).
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// Store the updated state X0..X7 back to 16(AX)..143(AX).
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// Odd step, message-expansion phase: the 128-byte buffer at 128..255(SP)
	// is rebuilt in two 64-byte halves as permute(itself) + the buffer at
	// 0..127(SP), using 64-bit lane additions.
	// NOTE(review): the SP frame layout is set up outside this view - confirm.
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	// Permute the 64-bit words of X0..X3 (X8/X9 are scratch). PSHUFD $0x4e
	// swaps the two 64-bit halves of a register. Lanes written [low, high]:
	//   X0 <- half-swapped X1,            X1 <- old X0
	//   X2 <- [old X3 high, old X2 low],  X3 <- [old X2 high, old X3 low]
	// Then add X4..X7 lane-wise (PADDQ, 64-bit lanes).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the first expanded half back over 128..191(SP).
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// Second half: same permute-and-add on 192..255(SP) with 64..127(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	// Same word permutation as for the first half, followed by the lane-wise add.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the second expanded half back over 192..255(SP).
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// Reload the eight 16-byte state vectors stored at 16(AX)..143(AX) into
	// X0..X7 (unaligned loads).
	// NOTE(review): AX is assigned before this view; presumably it points at
	// the hash context - confirm.
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// XOR the freshly expanded buffer 128..255(SP) into the state X0..X7
	// (X8..X11 are temporaries).
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load the 64 bytes of g_StepConstants at byte offsets 1088..1151 into
	// X8..X11; they are consumed by the XOR at the end of the alpha rotation.
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1088(SB), X8
	MOVOA g_StepConstants<>+1104(SB), X9
	MOVOA g_StepConstants<>+1120(SB), X10
	MOVOA g_StepConstants<>+1136(SB), X11

	// mix_odd
	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	// Rotate every 64-bit lane of X0..X3 left by 7 bits (shift left 0x07,
	// shift right 0x39, OR; X12 is scratch), then XOR in the step constants.
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3 (64-bit lanes).
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	// Rotate every 64-bit lane of X4..X7 left by 3 bits (shift left 0x03,
	// shift right 0x3d, OR). X8 is reused as scratch; the constant it held
	// has already been consumed.
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register rotation of X4..X7. X5, X6 and X7 are first rotated left
	// in every 64-bit lane by 32, 8 and 40 bits respectively (the two shifted
	// parts occupy disjoint bits, so PXOR acts as OR); X4 gets no such
	// whole-register rotation. Each register is then split with the two
	// 16-byte masks at g_BytePermInfo_sse2+0 and +16: the part selected by
	// the +0 mask is rotated left by a further 16 bits per lane and XORed
	// back onto the part selected by the +16 mask.
	// NOTE(review): the mask values are defined outside this view; presumably
	// they select complementary lanes so only one lane gets the extra 16-bit
	// rotation - confirm.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle 64-bit words across X0..X7 (X8/X9 scratch). Lanes written
	// [low, high], right-hand sides are the values before the shuffle:
	//   X0 <- [X1 low, X0 low],   X1 <- [X0 high, X1 high]
	//   X2 <- [X3 low, X2 low],   X3 <- [X2 high, X3 high]
	//   X4 <- [X4 low, X5 high],  X5 <- [X5 low, X4 high]
	//   X6 <- [X6 low, X7 high],  X7 <- [X7 low, X6 high]
	// The trailing MOVOA run then moves whole registers:
	//   X0<-X2, X1<-X3, X2<-X6, X3<-X7, X6<-X4, X7<-X5, X4<-X0, X5<-X1
	// (X0/X1 are parked in X8/X9 first so they are not lost).
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// Store the updated state X0..X7 back to 16(AX)..143(AX).
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// Even step, message-expansion phase: the 128-byte buffer at 0..127(SP) is
	// rebuilt in two 64-byte halves as permute(itself) + the buffer at
	// 128..255(SP), using 64-bit lane additions.
	// NOTE(review): the SP frame layout is set up outside this view - confirm.
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// Permute the 64-bit words of X0..X3 (X8/X9 are scratch). PSHUFD $0x4e
	// swaps the two 64-bit halves of a register. Lanes written [low, high]:
	//   X0 <- half-swapped X1,            X1 <- old X0
	//   X2 <- [old X3 high, old X2 low],  X3 <- [old X2 high, old X3 low]
	// Then add X4..X7 lane-wise (PADDQ, 64-bit lanes).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the first expanded half back over 0..63(SP).
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// Second half: same permute-and-add on 64..127(SP) with 192..255(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same word permutation as for the first half, followed by the lane-wise add.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the second expanded half back over 64..127(SP).
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// Reload the eight 16-byte state vectors stored at 16(AX)..143(AX) into
	// X0..X7 (unaligned loads).
	// NOTE(review): AX is assigned before this view; presumably it points at
	// the hash context - confirm.
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// XOR the freshly expanded buffer 0..127(SP) into the state X0..X7
	// (X8..X11 are temporaries).
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load the 64 bytes of g_StepConstants at byte offsets 1152..1215 into
	// X8..X11; they are consumed by the XOR at the end of the alpha rotation.
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1152(SB), X8
	MOVOA g_StepConstants<>+1168(SB), X9
	MOVOA g_StepConstants<>+1184(SB), X10
	MOVOA g_StepConstants<>+1200(SB), X11

	// mix_even
	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every 64-bit lane of X0..X3 left by 23 bits (shift left 0x17,
	// shift right 0x29, OR; X12 is scratch), then XOR in the step constants.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3 (64-bit lanes).
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every 64-bit lane of X4..X7 left by 59 bits (shift left 0x3b,
	// shift right 0x05, OR). X8 is reused as scratch; the constant it held
	// has already been consumed.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register rotation of X4..X7. X5, X6 and X7 are first rotated left
	// in every 64-bit lane by 32, 8 and 40 bits respectively (the two shifted
	// parts occupy disjoint bits, so PXOR acts as OR); X4 gets no such
	// whole-register rotation. Each register is then split with the two
	// 16-byte masks at g_BytePermInfo_sse2+0 and +16: the part selected by
	// the +0 mask is rotated left by a further 16 bits per lane and XORed
	// back onto the part selected by the +16 mask.
	// NOTE(review): the mask values are defined outside this view; presumably
	// they select complementary lanes so only one lane gets the extra 16-bit
	// rotation - confirm.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle 64-bit words across X0..X7 (X8/X9 scratch). Lanes written
	// [low, high], right-hand sides are the values before the shuffle:
	//   X0 <- [X1 low, X0 low],   X1 <- [X0 high, X1 high]
	//   X2 <- [X3 low, X2 low],   X3 <- [X2 high, X3 high]
	//   X4 <- [X4 low, X5 high],  X5 <- [X5 low, X4 high]
	//   X6 <- [X6 low, X7 high],  X7 <- [X7 low, X6 high]
	// The trailing MOVOA run then moves whole registers:
	//   X0<-X2, X1<-X3, X2<-X6, X3<-X7, X6<-X4, X7<-X5, X4<-X0, X5<-X1
	// (X0/X1 are parked in X8/X9 first so they are not lost).
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// Store the updated state X0..X7 back to 16(AX)..143(AX).
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// Odd step, message-expansion phase: the 128-byte buffer at 128..255(SP)
	// is rebuilt in two 64-byte halves as permute(itself) + the buffer at
	// 0..127(SP), using 64-bit lane additions.
	// NOTE(review): the SP frame layout is set up outside this view - confirm.
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	// Permute the 64-bit words of X0..X3 (X8/X9 are scratch). PSHUFD $0x4e
	// swaps the two 64-bit halves of a register. Lanes written [low, high]:
	//   X0 <- half-swapped X1,            X1 <- old X0
	//   X2 <- [old X3 high, old X2 low],  X3 <- [old X2 high, old X3 low]
	// Then add X4..X7 lane-wise (PADDQ, 64-bit lanes).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the first expanded half back over 128..191(SP).
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// Second half: same permute-and-add on 192..255(SP) with 64..127(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	// Same word permutation as for the first half, followed by the lane-wise add.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the second expanded half back over 192..255(SP).
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// Reload the eight 16-byte state vectors stored at 16(AX)..143(AX) into
	// X0..X7 (unaligned loads).
	// NOTE(review): AX is assigned before this view; presumably it points at
	// the hash context - confirm.
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// XOR the freshly expanded buffer 128..255(SP) into the state X0..X7
	// (X8..X11 are temporaries).
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load the 64 bytes of g_StepConstants at byte offsets 1216..1279 into
	// X8..X11; they are consumed by the XOR at the end of the alpha rotation.
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1216(SB), X8
	MOVOA g_StepConstants<>+1232(SB), X9
	MOVOA g_StepConstants<>+1248(SB), X10
	MOVOA g_StepConstants<>+1264(SB), X11

	// mix_odd
	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	// Rotate every 64-bit lane of X0..X3 left by 7 bits (shift left 0x07,
	// shift right 0x39, OR; X12 is scratch), then XOR in the step constants.
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3 (64-bit lanes).
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	// Rotate every 64-bit lane of X4..X7 left by 3 bits (shift left 0x03,
	// shift right 0x3d, OR). X8 is reused as scratch; the constant it held
	// has already been consumed.
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register rotation of X4..X7. X5, X6 and X7 are first rotated left
	// in every 64-bit lane by 32, 8 and 40 bits respectively (the two shifted
	// parts occupy disjoint bits, so PXOR acts as OR); X4 gets no such
	// whole-register rotation. Each register is then split with the two
	// 16-byte masks at g_BytePermInfo_sse2+0 and +16: the part selected by
	// the +0 mask is rotated left by a further 16 bits per lane and XORed
	// back onto the part selected by the +16 mask.
	// NOTE(review): the mask values are defined outside this view; presumably
	// they select complementary lanes so only one lane gets the extra 16-bit
	// rotation - confirm.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle 64-bit words across X0..X7 (X8/X9 scratch). Lanes written
	// [low, high], right-hand sides are the values before the shuffle:
	//   X0 <- [X1 low, X0 low],   X1 <- [X0 high, X1 high]
	//   X2 <- [X3 low, X2 low],   X3 <- [X2 high, X3 high]
	//   X4 <- [X4 low, X5 high],  X5 <- [X5 low, X4 high]
	//   X6 <- [X6 low, X7 high],  X7 <- [X7 low, X6 high]
	// The trailing MOVOA run then moves whole registers:
	//   X0<-X2, X1<-X3, X2<-X6, X3<-X7, X6<-X4, X7<-X5, X4<-X0, X5<-X1
	// (X0/X1 are parked in X8/X9 first so they are not lost).
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// Store the updated state X0..X7 back to 16(AX)..143(AX).
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// Even step, message-expansion phase: the 128-byte buffer at 0..127(SP) is
	// rebuilt in two 64-byte halves as permute(itself) + the buffer at
	// 128..255(SP), using 64-bit lane additions.
	// NOTE(review): the SP frame layout is set up outside this view - confirm.
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// Permute the 64-bit words of X0..X3 (X8/X9 are scratch). PSHUFD $0x4e
	// swaps the two 64-bit halves of a register. Lanes written [low, high]:
	//   X0 <- half-swapped X1,            X1 <- old X0
	//   X2 <- [old X3 high, old X2 low],  X3 <- [old X2 high, old X3 low]
	// Then add X4..X7 lane-wise (PADDQ, 64-bit lanes).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the first expanded half back over 0..63(SP).
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// Second half: same permute-and-add on 64..127(SP) with 192..255(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same word permutation as for the first half, followed by the lane-wise add.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the second expanded half back over 64..127(SP).
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// Reload the eight 16-byte state vectors stored at 16(AX)..143(AX) into
	// X0..X7 (unaligned loads).
	// NOTE(review): AX is assigned before this view; presumably it points at
	// the hash context - confirm.
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// XOR the freshly expanded buffer 0..127(SP) into the state X0..X7
	// (X8..X11 are temporaries).
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load the 64 bytes of g_StepConstants at byte offsets 1280..1343 into
	// X8..X11; they are consumed by the XOR at the end of the alpha rotation.
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1280(SB), X8
	MOVOA g_StepConstants<>+1296(SB), X9
	MOVOA g_StepConstants<>+1312(SB), X10
	MOVOA g_StepConstants<>+1328(SB), X11

	// mix_even
	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every 64-bit lane of X0..X3 left by 23 bits (shift left 0x17,
	// shift right 0x29, OR; X12 is scratch), then XOR in the step constants.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3 (64-bit lanes).
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every 64-bit lane of X4..X7 left by 59 bits (shift left 0x3b,
	// shift right 0x05, OR). X8 is reused as scratch; the constant it held
	// has already been consumed.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register rotation of X4..X7. X5, X6 and X7 are first rotated left
	// in every 64-bit lane by 32, 8 and 40 bits respectively (the two shifted
	// parts occupy disjoint bits, so PXOR acts as OR); X4 gets no such
	// whole-register rotation. Each register is then split with the two
	// 16-byte masks at g_BytePermInfo_sse2+0 and +16: the part selected by
	// the +0 mask is rotated left by a further 16 bits per lane and XORed
	// back onto the part selected by the +16 mask.
	// NOTE(review): the mask values are defined outside this view; presumably
	// they select complementary lanes so only one lane gets the extra 16-bit
	// rotation - confirm.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle 64-bit words across X0..X7 (X8/X9 scratch). Lanes written
	// [low, high], right-hand sides are the values before the shuffle:
	//   X0 <- [X1 low, X0 low],   X1 <- [X0 high, X1 high]
	//   X2 <- [X3 low, X2 low],   X3 <- [X2 high, X3 high]
	//   X4 <- [X4 low, X5 high],  X5 <- [X5 low, X4 high]
	//   X6 <- [X6 low, X7 high],  X7 <- [X7 low, X6 high]
	// The trailing MOVOA run then moves whole registers:
	//   X0<-X2, X1<-X3, X2<-X6, X3<-X7, X6<-X4, X7<-X5, X4<-X0, X5<-X1
	// (X0/X1 are parked in X8/X9 first so they are not lost).
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// Store the updated state X0..X7 back to 16(AX)..143(AX).
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// Odd step, message-expansion phase: the 128-byte buffer at 128..255(SP)
	// is rebuilt in two 64-byte halves as permute(itself) + the buffer at
	// 0..127(SP), using 64-bit lane additions.
	// NOTE(review): the SP frame layout is set up outside this view - confirm.
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	// Permute the 64-bit words of X0..X3 (X8/X9 are scratch). PSHUFD $0x4e
	// swaps the two 64-bit halves of a register. Lanes written [low, high]:
	//   X0 <- half-swapped X1,            X1 <- old X0
	//   X2 <- [old X3 high, old X2 low],  X3 <- [old X2 high, old X3 low]
	// Then add X4..X7 lane-wise (PADDQ, 64-bit lanes).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the first expanded half back over 128..191(SP).
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// Second half: same permute-and-add on 192..255(SP) with 64..127(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	// Same word permutation as for the first half, followed by the lane-wise add.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the second expanded half back over 192..255(SP).
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// Reload the eight 16-byte state vectors stored at 16(AX)..143(AX) into
	// X0..X7 (unaligned loads).
	// NOTE(review): AX is assigned before this view; presumably it points at
	// the hash context - confirm.
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// XOR the freshly expanded buffer 128..255(SP) into the state X0..X7
	// (X8..X11 are temporaries).
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load the 64 bytes of g_StepConstants at byte offsets 1344..1407 into
	// X8..X11; they are consumed by the XOR at the end of the alpha rotation.
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1344(SB), X8
	MOVOA g_StepConstants<>+1360(SB), X9
	MOVOA g_StepConstants<>+1376(SB), X10
	MOVOA g_StepConstants<>+1392(SB), X11

	// mix_odd
	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	// Rotate every 64-bit lane of X0..X3 left by 7 bits (shift left 0x07,
	// shift right 0x39, OR; X12 is scratch), then XOR in the step constants.
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3 (64-bit lanes).
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	// Rotate every 64-bit lane of X4..X7 left by 3 bits (shift left 0x03,
	// shift right 0x3d, OR). X8 is reused as scratch; the constant it held
	// has already been consumed.
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register rotation of X4..X7. X5, X6 and X7 are first rotated left
	// in every 64-bit lane by 32, 8 and 40 bits respectively (the two shifted
	// parts occupy disjoint bits, so PXOR acts as OR); X4 gets no such
	// whole-register rotation. Each register is then split with the two
	// 16-byte masks at g_BytePermInfo_sse2+0 and +16: the part selected by
	// the +0 mask is rotated left by a further 16 bits per lane and XORed
	// back onto the part selected by the +16 mask.
	// NOTE(review): the mask values are defined outside this view; presumably
	// they select complementary lanes so only one lane gets the extra 16-bit
	// rotation - confirm.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle 64-bit words across X0..X7 (X8/X9 scratch). Lanes written
	// [low, high], right-hand sides are the values before the shuffle:
	//   X0 <- [X1 low, X0 low],   X1 <- [X0 high, X1 high]
	//   X2 <- [X3 low, X2 low],   X3 <- [X2 high, X3 high]
	//   X4 <- [X4 low, X5 high],  X5 <- [X5 low, X4 high]
	//   X6 <- [X6 low, X7 high],  X7 <- [X7 low, X6 high]
	// The trailing MOVOA run then moves whole registers:
	//   X0<-X2, X1<-X3, X2<-X6, X3<-X7, X6<-X4, X7<-X5, X4<-X0, X5<-X1
	// (X0/X1 are parked in X8/X9 first so they are not lost).
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// Store the updated state X0..X7 back to 16(AX)..143(AX).
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// Even step, message-expansion phase: the 128-byte buffer at 0..127(SP) is
	// rebuilt in two 64-byte halves as permute(itself) + the buffer at
	// 128..255(SP), using 64-bit lane additions.
	// NOTE(review): the SP frame layout is set up outside this view - confirm.
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// Permute the 64-bit words of X0..X3 (X8/X9 are scratch). PSHUFD $0x4e
	// swaps the two 64-bit halves of a register. Lanes written [low, high]:
	//   X0 <- half-swapped X1,            X1 <- old X0
	//   X2 <- [old X3 high, old X2 low],  X3 <- [old X2 high, old X3 low]
	// Then add X4..X7 lane-wise (PADDQ, 64-bit lanes).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the first expanded half back over 0..63(SP).
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// Second half: same permute-and-add on 64..127(SP) with 192..255(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same word permutation as for the first half, followed by the lane-wise add.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// Write the second expanded half back over 64..127(SP).
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// Reload the eight 16-byte state vectors stored at 16(AX)..143(AX) into
	// X0..X7 (unaligned loads).
	// NOTE(review): AX is assigned before this view; presumably it points at
	// the hash context - confirm.
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// XOR the freshly expanded buffer 0..127(SP) into the state X0..X7
	// (X8..X11 are temporaries).
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load the 64 bytes of g_StepConstants at byte offsets 1408..1471 into
	// X8..X11; they are consumed by the XOR at the end of the alpha rotation.
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1408(SB), X8
	MOVOA g_StepConstants<>+1424(SB), X9
	MOVOA g_StepConstants<>+1440(SB), X10
	MOVOA g_StepConstants<>+1456(SB), X11

	// mix_even
	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every 64-bit lane of X0..X3 left by 23 bits (shift left 0x17,
	// shift right 0x29, OR; X12 is scratch), then XOR in the step constants.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3 (64-bit lanes).
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every 64-bit lane of X4..X7 left by 59 bits (shift left 0x3b,
	// shift right 0x05, OR). X8 is reused as scratch; the constant it held
	// has already been consumed.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// X0..X3 += X4..X7 (64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register rotation of X4..X7. X5, X6 and X7 are first rotated left
	// in every 64-bit lane by 32, 8 and 40 bits respectively (the two shifted
	// parts occupy disjoint bits, so PXOR acts as OR); X4 gets no such
	// whole-register rotation. Each register is then split with the two
	// 16-byte masks at g_BytePermInfo_sse2+0 and +16: the part selected by
	// the +0 mask is rotated left by a further 16 bits per lane and XORed
	// back onto the part selected by the +16 mask.
	// NOTE(review): the mask values are defined outside this view; presumably
	// they select complementary lanes so only one lane gets the extra 16-bit
	// rotation - confirm.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle 64-bit words across X0..X7 (X8/X9 scratch). Lanes written
	// [low, high], right-hand sides are the values before the shuffle:
	//   X0 <- [X1 low, X0 low],   X1 <- [X0 high, X1 high]
	//   X2 <- [X3 low, X2 low],   X3 <- [X2 high, X3 high]
	//   X4 <- [X4 low, X5 high],  X5 <- [X5 low, X4 high]
	//   X6 <- [X6 low, X7 high],  X7 <- [X7 low, X6 high]
	// The trailing MOVOA run then moves whole registers:
	//   X0<-X2, X1<-X3, X2<-X6, X3<-X7, X6<-X4, X7<-X5, X4<-X0, X5<-X1
	// (X0/X1 are parked in X8/X9 first so they are not lost).
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// Store the updated state X0..X7 back to 16(AX)..143(AX).
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1472(SB), X8
	MOVOA g_StepConstants<>+1488(SB), X9
	MOVOA g_StepConstants<>+1504(SB), X10
	MOVOA g_StepConstants<>+1520(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1536(SB), X8
	MOVOA g_StepConstants<>+1552(SB), X9
	MOVOA g_StepConstants<>+1568(SB), X10
	MOVOA g_StepConstants<>+1584(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1600(SB), X8
	MOVOA g_StepConstants<>+1616(SB), X9
	MOVOA g_StepConstants<>+1632(SB), X10
	MOVOA g_StepConstants<>+1648(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1664(SB), X8
	MOVOA g_StepConstants<>+1680(SB), X9
	MOVOA g_StepConstants<>+1696(SB), X10
	MOVOA g_StepConstants<>+1712(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1728(SB), X8
	MOVOA g_StepConstants<>+1744(SB), X9
	MOVOA g_StepConstants<>+1760(SB), X10
	MOVOA g_StepConstants<>+1776(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7
	ADDQ  CX, DX
	SUBQ  CX, BX
	MOVQ  $0x00000000, SI
	MOVQ  $0x00000000, CX

lsh512_sse2_update_if1_end:
lsh512_sse2_update_while_start:
	CMPQ BX, $0x00000100
	JL   lsh512_sse2_update_while_end

	// compress
	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU (DX), X8
	MOVOU X8, (SP)
	MOVOU 16(DX), X8
	MOVOU X8, 16(SP)
	MOVOU 32(DX), X8
	MOVOU X8, 32(SP)
	MOVOU 48(DX), X8
	MOVOU X8, 48(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 64(DX), X8
	MOVOU X8, 64(SP)
	MOVOU 80(DX), X8
	MOVOU X8, 80(SP)
	MOVOU 96(DX), X8
	MOVOU X8, 96(SP)
	MOVOU 112(DX), X8
	MOVOU X8, 112(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 128(DX), X8
	MOVOU X8, 128(SP)
	MOVOU 144(DX), X8
	MOVOU X8, 144(SP)
	MOVOU 160(DX), X8
	MOVOU X8, 160(SP)
	MOVOU 176(DX), X8
	MOVOU X8, 176(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 192(DX), X8
	MOVOU X8, 192(SP)
	MOVOU 208(DX), X8
	MOVOU X8, 208(SP)
	MOVOU 224(DX), X8
	MOVOU X8, 224(SP)
	MOVOU 240(DX), X8
	MOVOU X8, 240(SP)

	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+0(SB), X8
	MOVOA g_StepConstants<>+16(SB), X9
	MOVOA g_StepConstants<>+32(SB), X10
	MOVOA g_StepConstants<>+48(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+64(SB), X8
	MOVOA g_StepConstants<>+80(SB), X9
	MOVOA g_StepConstants<>+96(SB), X10
	MOVOA g_StepConstants<>+112(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+128(SB), X8
	MOVOA g_StepConstants<>+144(SB), X9
	MOVOA g_StepConstants<>+160(SB), X10
	MOVOA g_StepConstants<>+176(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+192(SB), X8
	MOVOA g_StepConstants<>+208(SB), X9
	MOVOA g_StepConstants<>+224(SB), X10
	MOVOA g_StepConstants<>+240(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+256(SB), X8
	MOVOA g_StepConstants<>+272(SB), X9
	MOVOA g_StepConstants<>+288(SB), X10
	MOVOA g_StepConstants<>+304(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+320(SB), X8
	MOVOA g_StepConstants<>+336(SB), X9
	MOVOA g_StepConstants<>+352(SB), X10
	MOVOA g_StepConstants<>+368(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+384(SB), X8
	MOVOA g_StepConstants<>+400(SB), X9
	MOVOA g_StepConstants<>+416(SB), X10
	MOVOA g_StepConstants<>+432(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+448(SB), X8
	MOVOA g_StepConstants<>+464(SB), X9
	MOVOA g_StepConstants<>+480(SB), X10
	MOVOA g_StepConstants<>+496(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+512(SB), X8
	MOVOA g_StepConstants<>+528(SB), X9
	MOVOA g_StepConstants<>+544(SB), X10
	MOVOA g_StepConstants<>+560(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+576(SB), X8
	MOVOA g_StepConstants<>+592(SB), X9
	MOVOA g_StepConstants<>+608(SB), X10
	MOVOA g_StepConstants<>+624(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+640(SB), X8
	MOVOA g_StepConstants<>+656(SB), X9
	MOVOA g_StepConstants<>+672(SB), X10
	MOVOA g_StepConstants<>+688(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+704(SB), X8
	MOVOA g_StepConstants<>+720(SB), X9
	MOVOA g_StepConstants<>+736(SB), X10
	MOVOA g_StepConstants<>+752(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+768(SB), X8
	MOVOA g_StepConstants<>+784(SB), X9
	MOVOA g_StepConstants<>+800(SB), X10
	MOVOA g_StepConstants<>+816(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+832(SB), X8
	MOVOA g_StepConstants<>+848(SB), X9
	MOVOA g_StepConstants<>+864(SB), X10
	MOVOA g_StepConstants<>+880(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+896(SB), X8
	MOVOA g_StepConstants<>+912(SB), X9
	MOVOA g_StepConstants<>+928(SB), X10
	MOVOA g_StepConstants<>+944(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+960(SB), X8
	MOVOA g_StepConstants<>+976(SB), X9
	MOVOA g_StepConstants<>+992(SB), X10
	MOVOA g_StepConstants<>+1008(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1024(SB), X8
	MOVOA g_StepConstants<>+1040(SB), X9
	MOVOA g_StepConstants<>+1056(SB), X10
	MOVOA g_StepConstants<>+1072(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1088(SB), X8
	MOVOA g_StepConstants<>+1104(SB), X9
	MOVOA g_StepConstants<>+1120(SB), X10
	MOVOA g_StepConstants<>+1136(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1152(SB), X8
	MOVOA g_StepConstants<>+1168(SB), X9
	MOVOA g_StepConstants<>+1184(SB), X10
	MOVOA g_StepConstants<>+1200(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1216(SB), X8
	MOVOA g_StepConstants<>+1232(SB), X9
	MOVOA g_StepConstants<>+1248(SB), X10
	MOVOA g_StepConstants<>+1264(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1280(SB), X8
	MOVOA g_StepConstants<>+1296(SB), X9
	MOVOA g_StepConstants<>+1312(SB), X10
	MOVOA g_StepConstants<>+1328(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1344(SB), X8
	MOVOA g_StepConstants<>+1360(SB), X9
	MOVOA g_StepConstants<>+1376(SB), X10
	MOVOA g_StepConstants<>+1392(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1408(SB), X8
	MOVOA g_StepConstants<>+1424(SB), X9
	MOVOA g_StepConstants<>+1440(SB), X10
	MOVOA g_StepConstants<>+1456(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1472(SB), X8
	MOVOA g_StepConstants<>+1488(SB), X9
	MOVOA g_StepConstants<>+1504(SB), X10
	MOVOA g_StepConstants<>+1520(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1536(SB), X8
	MOVOA g_StepConstants<>+1552(SB), X9
	MOVOA g_StepConstants<>+1568(SB), X10
	MOVOA g_StepConstants<>+1584(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1600(SB), X8
	MOVOA g_StepConstants<>+1616(SB), X9
	MOVOA g_StepConstants<>+1632(SB), X10
	MOVOA g_StepConstants<>+1648(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1664(SB), X8
	MOVOA g_StepConstants<>+1680(SB), X9
	MOVOA g_StepConstants<>+1696(SB), X10
	MOVOA g_StepConstants<>+1712(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1728(SB), X8
	MOVOA g_StepConstants<>+1744(SB), X9
	MOVOA g_StepConstants<>+1760(SB), X10
	MOVOA g_StepConstants<>+1776(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7
	ADDQ  $0x00000100, DX
	SUBQ  $0x00000100, BX
	JMP   lsh512_sse2_update_while_start

// Exit path of the block loop. The instruction just before this label is an
// unconditional JMP, so control only arrives here through a branch from code
// that precedes it.
lsh512_sse2_update_while_end:
	// store_blk
	// Write X0..X3 back to the 64 bytes at 16(AX) with unaligned stores.
	// NOTE(review): AX presumably still holds the ctx pointer here (the loop
	// body uses the same 16(AX)..128(AX) offsets); confirm against the prologue.
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	// Write X4..X7 back to the following 64 bytes at 80(AX).
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)
	// Skip the tail copy below when BX is zero.
	// NOTE(review): BX looks like the count of input bytes still unprocessed
	// (the loop subtracts 0x100 from it while advancing DX by 0x100); confirm.
	CMPQ  BX, $0x00000000
	JE    lsh512_sse2_update_if3_end

	// Memcpy
	// Copy BX bytes from (DX) to the buffer that starts at 144(AX).
	// Registers: AX = destination cursor, DX = source cursor, SI = bytes left.
	// Scratch: X0 for 16-byte moves, CX/CL for the narrower moves.
	// AX no longer points at the start of the context after this block.
	LEAQ 144(AX), AX
	// LEAQ (DX), DX leaves DX unchanged.
	LEAQ (DX), DX
	MOVQ BX, SI

	// Copy in 16-byte chunks while SI >= 16 (JL is a signed comparison).
memcpy_3_sz16_start:
	CMPQ  SI, $0x00000010
	JL    memcpy_3_sz16_end
	MOVOU (DX), X0
	MOVOU X0, (AX)
	ADDQ  $0x00000010, DX
	ADDQ  $0x00000010, AX
	SUBQ  $0x00000010, SI
	JMP   memcpy_3_sz16_start

	// Copy in 8-byte chunks while SI >= 8.
memcpy_3_sz16_end:
memcpy_3_sz8_start:
	CMPQ SI, $0x00000008
	JL   memcpy_3_sz8_end
	MOVQ (DX), CX
	MOVQ CX, (AX)
	ADDQ $0x00000008, DX
	ADDQ $0x00000008, AX
	SUBQ $0x00000008, SI
	JMP  memcpy_3_sz8_start

	// Copy in 4-byte chunks while SI >= 4.
memcpy_3_sz8_end:
memcpy_3_sz4_start:
	CMPQ SI, $0x00000004
	JL   memcpy_3_sz4_end
	MOVL (DX), CX
	MOVL CX, (AX)
	ADDQ $0x00000004, DX
	ADDQ $0x00000004, AX
	SUBQ $0x00000004, SI
	JMP  memcpy_3_sz4_start

	// Copy in 2-byte chunks while SI >= 2.
memcpy_3_sz4_end:
memcpy_3_sz2_start:
	CMPQ SI, $0x00000002
	JL   memcpy_3_sz2_end
	MOVW (DX), CX
	MOVW CX, (AX)
	ADDQ $0x00000002, DX
	ADDQ $0x00000002, AX
	SUBQ $0x00000002, SI
	JMP  memcpy_3_sz2_start

	// Copy the last byte while SI >= 1.
memcpy_3_sz2_end:
memcpy_3_sz1_start:
	CMPQ SI, $0x00000001
	JL   memcpy_3_sz1_end
	MOVB (DX), CL
	MOVB CL, (AX)
	ADDQ $0x00000001, DX
	ADDQ $0x00000001, AX
	SUBQ $0x00000001, SI
	JMP  memcpy_3_sz1_start

memcpy_3_sz1_end:
	// CX = number of bytes just copied; the return path stores it at 8(ctx).
	MOVQ BX, CX

// Common return path: store CX into the 8-byte field at offset 8 of ctx.
// ctx is reloaded from the argument frame because the tail copy above
// advances AX. When the copy is skipped (BX == 0), CX holds whatever value
// earlier code in this function left in it.
// lsh512FinalSSE2 reads this field back and uses it as a byte index into the
// buffer at 144(ctx).
lsh512_sse2_update_if3_end:
lsh512_sse2_update_ret:
	MOVQ ctx+0(FP), AX
	MOVQ CX, 8(AX)
	RET

// func lsh512FinalSSE2(ctx *lsh512ContextAsmData, hashval []byte)
// Requires: SSE2
TEXT ·lsh512FinalSSE2(SB), NOSPLIT, $256-32
	MOVQ ctx+0(FP), AX
	MOVL (AX), CX
	MOVQ 8(AX), CX
	MOVQ hashval_base+8(FP), DX

	// lsh512_sse2_final
	MOVQ CX, BX
	MOVB $0x80, 144(AX)(BX*1)
	MOVQ $0x000000ff, SI
	SUBQ BX, SI

	// memset
	LEAQ 145(AX)(BX*1), BX
	CMPQ SI, $0x00000010
	JL   memset_1_sz16_end
	MOVO memset_value_0<>+0(SB), X0

memset_1_sz16_start:
	MOVOU X0, (BX)
	SUBQ  $0x00000010, SI
	ADDQ  $0x00000010, BX
	CMPQ  SI, $0x00000010
	JL    memset_1_sz16_end
	JMP   memset_1_sz16_start

memset_1_sz16_end:
	CMPQ SI, $0x00000008
	JL   memset_1_sz8_end
	MOVQ memset_value_0<>+0(SB), DI

memset_1_sz8_start:
	MOVQ DI, (BX)
	SUBQ $0x00000008, SI
	ADDQ $0x00000008, BX
	CMPQ SI, $0x00000008
	JL   memset_1_sz8_end
	JMP  memset_1_sz8_start

memset_1_sz8_end:
	CMPQ SI, $0x00000004
	JL   memset_1_sz4_end
	MOVL memset_value_0<>+0(SB), DI

memset_1_sz4_start:
	MOVL DI, (BX)
	SUBQ $0x00000004, SI
	ADDQ $0x00000004, BX
	CMPQ SI, $0x00000004
	JL   memset_1_sz4_end
	JMP  memset_1_sz4_start

memset_1_sz4_end:
	CMPQ SI, $0x00000002
	JL   memset_1_sz2_end
	MOVW memset_value_0<>+0(SB), DI

memset_1_sz2_start:
	MOVW DI, (BX)
	SUBQ $0x00000002, SI
	ADDQ $0x00000002, BX
	CMPQ SI, $0x00000002
	JL   memset_1_sz2_end
	JMP  memset_1_sz2_start

memset_1_sz2_end:
memset_1_1_start:
	CMPQ SI, $0x00000000
	JE   memset_1_1_end
	MOVB $0x00, (BX)
	SUBQ $0x00000001, SI
	ADDQ $0x00000001, BX
	JMP  memset_1_1_start

memset_1_1_end:
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// compress
	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 144(AX), X8
	MOVOU X8, (SP)
	MOVOU 160(AX), X8
	MOVOU X8, 16(SP)
	MOVOU 176(AX), X8
	MOVOU X8, 32(SP)
	MOVOU 192(AX), X8
	MOVOU X8, 48(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 208(AX), X8
	MOVOU X8, 64(SP)
	MOVOU 224(AX), X8
	MOVOU X8, 80(SP)
	MOVOU 240(AX), X8
	MOVOU X8, 96(SP)
	MOVOU 256(AX), X8
	MOVOU X8, 112(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 272(AX), X8
	MOVOU X8, 128(SP)
	MOVOU 288(AX), X8
	MOVOU X8, 144(SP)
	MOVOU 304(AX), X8
	MOVOU X8, 160(SP)
	MOVOU 320(AX), X8
	MOVOU X8, 176(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 336(AX), X8
	MOVOU X8, 192(SP)
	MOVOU 352(AX), X8
	MOVOU X8, 208(SP)
	MOVOU 368(AX), X8
	MOVOU X8, 224(SP)
	MOVOU 384(AX), X8
	MOVOU X8, 240(SP)

	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+0(SB), X8
	MOVOA g_StepConstants<>+16(SB), X9
	MOVOA g_StepConstants<>+32(SB), X10
	MOVOA g_StepConstants<>+48(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+64(SB), X8
	MOVOA g_StepConstants<>+80(SB), X9
	MOVOA g_StepConstants<>+96(SB), X10
	MOVOA g_StepConstants<>+112(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+128(SB), X8
	MOVOA g_StepConstants<>+144(SB), X9
	MOVOA g_StepConstants<>+160(SB), X10
	MOVOA g_StepConstants<>+176(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+192(SB), X8
	MOVOA g_StepConstants<>+208(SB), X9
	MOVOA g_StepConstants<>+224(SB), X10
	MOVOA g_StepConstants<>+240(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+256(SB), X8
	MOVOA g_StepConstants<>+272(SB), X9
	MOVOA g_StepConstants<>+288(SB), X10
	MOVOA g_StepConstants<>+304(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+320(SB), X8
	MOVOA g_StepConstants<>+336(SB), X9
	MOVOA g_StepConstants<>+352(SB), X10
	MOVOA g_StepConstants<>+368(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+384(SB), X8
	MOVOA g_StepConstants<>+400(SB), X9
	MOVOA g_StepConstants<>+416(SB), X10
	MOVOA g_StepConstants<>+432(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+448(SB), X8
	MOVOA g_StepConstants<>+464(SB), X9
	MOVOA g_StepConstants<>+480(SB), X10
	MOVOA g_StepConstants<>+496(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+512(SB), X8
	MOVOA g_StepConstants<>+528(SB), X9
	MOVOA g_StepConstants<>+544(SB), X10
	MOVOA g_StepConstants<>+560(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+576(SB), X8
	MOVOA g_StepConstants<>+592(SB), X9
	MOVOA g_StepConstants<>+608(SB), X10
	MOVOA g_StepConstants<>+624(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+640(SB), X8
	MOVOA g_StepConstants<>+656(SB), X9
	MOVOA g_StepConstants<>+672(SB), X10
	MOVOA g_StepConstants<>+688(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+704(SB), X8
	MOVOA g_StepConstants<>+720(SB), X9
	MOVOA g_StepConstants<>+736(SB), X10
	MOVOA g_StepConstants<>+752(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+768(SB), X8
	MOVOA g_StepConstants<>+784(SB), X9
	MOVOA g_StepConstants<>+800(SB), X10
	MOVOA g_StepConstants<>+816(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+832(SB), X8
	MOVOA g_StepConstants<>+848(SB), X9
	MOVOA g_StepConstants<>+864(SB), X10
	MOVOA g_StepConstants<>+880(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+896(SB), X8
	MOVOA g_StepConstants<>+912(SB), X9
	MOVOA g_StepConstants<>+928(SB), X10
	MOVOA g_StepConstants<>+944(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+960(SB), X8
	MOVOA g_StepConstants<>+976(SB), X9
	MOVOA g_StepConstants<>+992(SB), X10
	MOVOA g_StepConstants<>+1008(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1024(SB), X8
	MOVOA g_StepConstants<>+1040(SB), X9
	MOVOA g_StepConstants<>+1056(SB), X10
	MOVOA g_StepConstants<>+1072(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1088(SB), X8
	MOVOA g_StepConstants<>+1104(SB), X9
	MOVOA g_StepConstants<>+1120(SB), X10
	MOVOA g_StepConstants<>+1136(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1152(SB), X8
	MOVOA g_StepConstants<>+1168(SB), X9
	MOVOA g_StepConstants<>+1184(SB), X10
	MOVOA g_StepConstants<>+1200(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1216(SB), X8
	MOVOA g_StepConstants<>+1232(SB), X9
	MOVOA g_StepConstants<>+1248(SB), X10
	MOVOA g_StepConstants<>+1264(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// Unrolled even step (generator labels: msg_exp_even, msg_add_even, mix_even,
	// word_perm). It first updates the two 64-byte stack blocks at 0(SP) and
	// 64(SP), then mixes the eight vectors kept at 16(AX)..143(AX) with them and
	// with the step constants at g_StepConstants<>+1280, and stores the result
	// back to 16(AX)..143(AX).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// Permute the quadwords of X0..X3 (X8/X9 are scratch); PSHUFD $0x4e swaps
	// the two 64-bit halves of a register. Written as {low, high}:
	//   X0 = half-swapped old X1, X1 = old X0,
	//   X2 = {old X3 high, old X2 low}, X3 = {old X2 high, old X3 low}.
	// Then add the block loaded from 128(SP) per 64-bit lane.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The permuted-and-summed block overwrites 0(SP)..63(SP).
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same quadword permutation as above, now on the block from 64(SP), followed
	// by a per-lane add of the block from 192(SP).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The result overwrites 64(SP)..127(SP).
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	// Reload the eight working vectors from 16(AX)..143(AX) into X0..X7.
	// NOTE(review): AX is set up outside this span; presumably it points at the
	// hash context - confirm against the function prologue.
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// XOR the block just written to 0(SP) into X0..X3 (X8..X11 are scratch).
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// XOR the block just written to 64(SP) into X4..X7.
	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load this step's 64 bytes of constants into X8..X11 (aligned loads).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1280(SB), X8
	MOVOA g_StepConstants<>+1296(SB), X9
	MOVOA g_StepConstants<>+1312(SB), X10
	MOVOA g_StepConstants<>+1328(SB), X11

	// mix_even
	// add_blk
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every quadword of X0..X3 left by 23 bits: (x << 0x17) | (x >> 0x29),
	// 23 + 41 = 64. X12 is scratch. Afterwards XOR in the step constants held
	// in X8..X11.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	// X4..X7 += X0..X3.
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every quadword of X4..X7 left by 59 bits: (x << 0x3b) | (x >> 0x05).
	// X8 is reused as scratch; the step constant it held was consumed above.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	// X0..X3 += X4..X7.
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register bit rotations of X4..X7, built from SSE2 shifts and two masks
	// at g_BytePermInfo_sse2<>+0 and +16 (mask values are defined outside this
	// span). In each group the part selected by the +0 mask is rotated left by
	// 16 bits within its quadword ((m << 16) ^ (m >> 48)) and XORed into the
	// part kept by the +16 mask. X5, X6 and X7 are additionally pre-rotated left
	// by 32, 8 and 40 bits per quadword; X4 gets no pre-rotation.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle quadwords inside the register pairs (X0,X1), (X2,X3), (X4,X5) and
	// (X6,X7) with PUNPCKLQDQ/PUNPCKHQDQ (plus a half swap of X5 and X7), then
	// rotate whole registers using X8/X9 as scratch:
	//   X0..X3 <- X2, X3, X6, X7;  X6, X7 <- X4, X5;  X4, X5 <- X0, X1.
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	// Write the eight working vectors back to 16(AX)..143(AX).
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// Unrolled odd step (generator labels: msg_exp_odd, msg_add_odd, mix_odd,
	// word_perm). It first updates the two 64-byte stack blocks at 128(SP) and
	// 192(SP), then mixes the eight vectors kept at 16(AX)..143(AX) with them and
	// with the step constants at g_StepConstants<>+1344, and stores the result
	// back to 16(AX)..143(AX).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	// Permute the quadwords of X0..X3 (X8/X9 are scratch); PSHUFD $0x4e swaps
	// the two 64-bit halves of a register. Written as {low, high}:
	//   X0 = half-swapped old X1, X1 = old X0,
	//   X2 = {old X3 high, old X2 low}, X3 = {old X2 high, old X3 low}.
	// Then add the block loaded from 0(SP) per 64-bit lane.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The permuted-and-summed block overwrites 128(SP)..191(SP).
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	// Same quadword permutation as above, now on the block from 192(SP),
	// followed by a per-lane add of the block from 64(SP).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The result overwrites 192(SP)..255(SP).
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	// Reload the eight working vectors from 16(AX)..143(AX) into X0..X7.
	// NOTE(review): AX is set up outside this span; presumably it points at the
	// hash context - confirm against the function prologue.
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// XOR the block just written to 128(SP) into X0..X3 (X8..X11 are scratch).
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// XOR the block just written to 192(SP) into X4..X7.
	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load this step's 64 bytes of constants into X8..X11 (aligned loads).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1344(SB), X8
	MOVOA g_StepConstants<>+1360(SB), X9
	MOVOA g_StepConstants<>+1376(SB), X10
	MOVOA g_StepConstants<>+1392(SB), X11

	// mix_odd
	// add_blk
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	// Rotate every quadword of X0..X3 left by 7 bits: (x << 0x07) | (x >> 0x39),
	// 7 + 57 = 64. X12 is scratch. Afterwards XOR in the step constants held
	// in X8..X11.
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	// X4..X7 += X0..X3.
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	// Rotate every quadword of X4..X7 left by 3 bits: (x << 0x03) | (x >> 0x3d).
	// X8 is reused as scratch; the step constant it held was consumed above.
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	// X0..X3 += X4..X7.
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register bit rotations of X4..X7, built from SSE2 shifts and two masks
	// at g_BytePermInfo_sse2<>+0 and +16 (mask values are defined outside this
	// span). In each group the part selected by the +0 mask is rotated left by
	// 16 bits within its quadword ((m << 16) ^ (m >> 48)) and XORed into the
	// part kept by the +16 mask. X5, X6 and X7 are additionally pre-rotated left
	// by 32, 8 and 40 bits per quadword; X4 gets no pre-rotation.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle quadwords inside the register pairs (X0,X1), (X2,X3), (X4,X5) and
	// (X6,X7) with PUNPCKLQDQ/PUNPCKHQDQ (plus a half swap of X5 and X7), then
	// rotate whole registers using X8/X9 as scratch:
	//   X0..X3 <- X2, X3, X6, X7;  X6, X7 <- X4, X5;  X4, X5 <- X0, X1.
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	// Write the eight working vectors back to 16(AX)..143(AX).
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// Unrolled even step (generator labels: msg_exp_even, msg_add_even, mix_even,
	// word_perm). It first updates the two 64-byte stack blocks at 0(SP) and
	// 64(SP), then mixes the eight vectors kept at 16(AX)..143(AX) with them and
	// with the step constants at g_StepConstants<>+1408, and stores the result
	// back to 16(AX)..143(AX).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// Permute the quadwords of X0..X3 (X8/X9 are scratch); PSHUFD $0x4e swaps
	// the two 64-bit halves of a register. Written as {low, high}:
	//   X0 = half-swapped old X1, X1 = old X0,
	//   X2 = {old X3 high, old X2 low}, X3 = {old X2 high, old X3 low}.
	// Then add the block loaded from 128(SP) per 64-bit lane.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The permuted-and-summed block overwrites 0(SP)..63(SP).
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same quadword permutation as above, now on the block from 64(SP), followed
	// by a per-lane add of the block from 192(SP).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The result overwrites 64(SP)..127(SP).
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	// Reload the eight working vectors from 16(AX)..143(AX) into X0..X7.
	// NOTE(review): AX is set up outside this span; presumably it points at the
	// hash context - confirm against the function prologue.
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// XOR the block just written to 0(SP) into X0..X3 (X8..X11 are scratch).
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// XOR the block just written to 64(SP) into X4..X7.
	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load this step's 64 bytes of constants into X8..X11 (aligned loads).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1408(SB), X8
	MOVOA g_StepConstants<>+1424(SB), X9
	MOVOA g_StepConstants<>+1440(SB), X10
	MOVOA g_StepConstants<>+1456(SB), X11

	// mix_even
	// add_blk
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every quadword of X0..X3 left by 23 bits: (x << 0x17) | (x >> 0x29),
	// 23 + 41 = 64. X12 is scratch. Afterwards XOR in the step constants held
	// in X8..X11.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	// X4..X7 += X0..X3.
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every quadword of X4..X7 left by 59 bits: (x << 0x3b) | (x >> 0x05).
	// X8 is reused as scratch; the step constant it held was consumed above.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	// X0..X3 += X4..X7.
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register bit rotations of X4..X7, built from SSE2 shifts and two masks
	// at g_BytePermInfo_sse2<>+0 and +16 (mask values are defined outside this
	// span). In each group the part selected by the +0 mask is rotated left by
	// 16 bits within its quadword ((m << 16) ^ (m >> 48)) and XORed into the
	// part kept by the +16 mask. X5, X6 and X7 are additionally pre-rotated left
	// by 32, 8 and 40 bits per quadword; X4 gets no pre-rotation.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle quadwords inside the register pairs (X0,X1), (X2,X3), (X4,X5) and
	// (X6,X7) with PUNPCKLQDQ/PUNPCKHQDQ (plus a half swap of X5 and X7), then
	// rotate whole registers using X8/X9 as scratch:
	//   X0..X3 <- X2, X3, X6, X7;  X6, X7 <- X4, X5;  X4, X5 <- X0, X1.
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	// Write the eight working vectors back to 16(AX)..143(AX).
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// Unrolled odd step (generator labels: msg_exp_odd, msg_add_odd, mix_odd,
	// word_perm). It first updates the two 64-byte stack blocks at 128(SP) and
	// 192(SP), then mixes the eight vectors kept at 16(AX)..143(AX) with them and
	// with the step constants at g_StepConstants<>+1472, and stores the result
	// back to 16(AX)..143(AX).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	// Permute the quadwords of X0..X3 (X8/X9 are scratch); PSHUFD $0x4e swaps
	// the two 64-bit halves of a register. Written as {low, high}:
	//   X0 = half-swapped old X1, X1 = old X0,
	//   X2 = {old X3 high, old X2 low}, X3 = {old X2 high, old X3 low}.
	// Then add the block loaded from 0(SP) per 64-bit lane.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The permuted-and-summed block overwrites 128(SP)..191(SP).
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	// Same quadword permutation as above, now on the block from 192(SP),
	// followed by a per-lane add of the block from 64(SP).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The result overwrites 192(SP)..255(SP).
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	// Reload the eight working vectors from 16(AX)..143(AX) into X0..X7.
	// NOTE(review): AX is set up outside this span; presumably it points at the
	// hash context - confirm against the function prologue.
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// XOR the block just written to 128(SP) into X0..X3 (X8..X11 are scratch).
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// XOR the block just written to 192(SP) into X4..X7.
	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load this step's 64 bytes of constants into X8..X11 (aligned loads).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1472(SB), X8
	MOVOA g_StepConstants<>+1488(SB), X9
	MOVOA g_StepConstants<>+1504(SB), X10
	MOVOA g_StepConstants<>+1520(SB), X11

	// mix_odd
	// add_blk
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	// Rotate every quadword of X0..X3 left by 7 bits: (x << 0x07) | (x >> 0x39),
	// 7 + 57 = 64. X12 is scratch. Afterwards XOR in the step constants held
	// in X8..X11.
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	// X4..X7 += X0..X3.
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	// Rotate every quadword of X4..X7 left by 3 bits: (x << 0x03) | (x >> 0x3d).
	// X8 is reused as scratch; the step constant it held was consumed above.
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	// X0..X3 += X4..X7.
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register bit rotations of X4..X7, built from SSE2 shifts and two masks
	// at g_BytePermInfo_sse2<>+0 and +16 (mask values are defined outside this
	// span). In each group the part selected by the +0 mask is rotated left by
	// 16 bits within its quadword ((m << 16) ^ (m >> 48)) and XORed into the
	// part kept by the +16 mask. X5, X6 and X7 are additionally pre-rotated left
	// by 32, 8 and 40 bits per quadword; X4 gets no pre-rotation.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle quadwords inside the register pairs (X0,X1), (X2,X3), (X4,X5) and
	// (X6,X7) with PUNPCKLQDQ/PUNPCKHQDQ (plus a half swap of X5 and X7), then
	// rotate whole registers using X8/X9 as scratch:
	//   X0..X3 <- X2, X3, X6, X7;  X6, X7 <- X4, X5;  X4, X5 <- X0, X1.
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	// Write the eight working vectors back to 16(AX)..143(AX).
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// Unrolled even step (generator labels: msg_exp_even, msg_add_even, mix_even,
	// word_perm). It first updates the two 64-byte stack blocks at 0(SP) and
	// 64(SP), then mixes the eight vectors kept at 16(AX)..143(AX) with them and
	// with the step constants at g_StepConstants<>+1536, and stores the result
	// back to 16(AX)..143(AX).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// Permute the quadwords of X0..X3 (X8/X9 are scratch); PSHUFD $0x4e swaps
	// the two 64-bit halves of a register. Written as {low, high}:
	//   X0 = half-swapped old X1, X1 = old X0,
	//   X2 = {old X3 high, old X2 low}, X3 = {old X2 high, old X3 low}.
	// Then add the block loaded from 128(SP) per 64-bit lane.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The permuted-and-summed block overwrites 0(SP)..63(SP).
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same quadword permutation as above, now on the block from 64(SP), followed
	// by a per-lane add of the block from 192(SP).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The result overwrites 64(SP)..127(SP).
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	// Reload the eight working vectors from 16(AX)..143(AX) into X0..X7.
	// NOTE(review): AX is set up outside this span; presumably it points at the
	// hash context - confirm against the function prologue.
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// XOR the block just written to 0(SP) into X0..X3 (X8..X11 are scratch).
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// XOR the block just written to 64(SP) into X4..X7.
	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load this step's 64 bytes of constants into X8..X11 (aligned loads).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1536(SB), X8
	MOVOA g_StepConstants<>+1552(SB), X9
	MOVOA g_StepConstants<>+1568(SB), X10
	MOVOA g_StepConstants<>+1584(SB), X11

	// mix_even
	// add_blk
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every quadword of X0..X3 left by 23 bits: (x << 0x17) | (x >> 0x29),
	// 23 + 41 = 64. X12 is scratch. Afterwards XOR in the step constants held
	// in X8..X11.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	// X4..X7 += X0..X3.
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every quadword of X4..X7 left by 59 bits: (x << 0x3b) | (x >> 0x05).
	// X8 is reused as scratch; the step constant it held was consumed above.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	// X0..X3 += X4..X7.
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register bit rotations of X4..X7, built from SSE2 shifts and two masks
	// at g_BytePermInfo_sse2<>+0 and +16 (mask values are defined outside this
	// span). In each group the part selected by the +0 mask is rotated left by
	// 16 bits within its quadword ((m << 16) ^ (m >> 48)) and XORed into the
	// part kept by the +16 mask. X5, X6 and X7 are additionally pre-rotated left
	// by 32, 8 and 40 bits per quadword; X4 gets no pre-rotation.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle quadwords inside the register pairs (X0,X1), (X2,X3), (X4,X5) and
	// (X6,X7) with PUNPCKLQDQ/PUNPCKHQDQ (plus a half swap of X5 and X7), then
	// rotate whole registers using X8/X9 as scratch:
	//   X0..X3 <- X2, X3, X6, X7;  X6, X7 <- X4, X5;  X4, X5 <- X0, X1.
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	// Write the eight working vectors back to 16(AX)..143(AX).
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// Unrolled odd step (generator labels: msg_exp_odd, msg_add_odd, mix_odd,
	// word_perm). It first updates the two 64-byte stack blocks at 128(SP) and
	// 192(SP), then mixes the eight vectors kept at 16(AX)..143(AX) with them and
	// with the step constants at g_StepConstants<>+1600, and stores the result
	// back to 16(AX)..143(AX).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	// Permute the quadwords of X0..X3 (X8/X9 are scratch); PSHUFD $0x4e swaps
	// the two 64-bit halves of a register. Written as {low, high}:
	//   X0 = half-swapped old X1, X1 = old X0,
	//   X2 = {old X3 high, old X2 low}, X3 = {old X2 high, old X3 low}.
	// Then add the block loaded from 0(SP) per 64-bit lane.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The permuted-and-summed block overwrites 128(SP)..191(SP).
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	// Same quadword permutation as above, now on the block from 192(SP),
	// followed by a per-lane add of the block from 64(SP).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The result overwrites 192(SP)..255(SP).
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	// Reload the eight working vectors from 16(AX)..143(AX) into X0..X7.
	// NOTE(review): AX is set up outside this span; presumably it points at the
	// hash context - confirm against the function prologue.
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// XOR the block just written to 128(SP) into X0..X3 (X8..X11 are scratch).
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// XOR the block just written to 192(SP) into X4..X7.
	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load this step's 64 bytes of constants into X8..X11 (aligned loads).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1600(SB), X8
	MOVOA g_StepConstants<>+1616(SB), X9
	MOVOA g_StepConstants<>+1632(SB), X10
	MOVOA g_StepConstants<>+1648(SB), X11

	// mix_odd
	// add_blk
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	// Rotate every quadword of X0..X3 left by 7 bits: (x << 0x07) | (x >> 0x39),
	// 7 + 57 = 64. X12 is scratch. Afterwards XOR in the step constants held
	// in X8..X11.
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	// X4..X7 += X0..X3.
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	// Rotate every quadword of X4..X7 left by 3 bits: (x << 0x03) | (x >> 0x3d).
	// X8 is reused as scratch; the step constant it held was consumed above.
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	// X0..X3 += X4..X7.
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register bit rotations of X4..X7, built from SSE2 shifts and two masks
	// at g_BytePermInfo_sse2<>+0 and +16 (mask values are defined outside this
	// span). In each group the part selected by the +0 mask is rotated left by
	// 16 bits within its quadword ((m << 16) ^ (m >> 48)) and XORed into the
	// part kept by the +16 mask. X5, X6 and X7 are additionally pre-rotated left
	// by 32, 8 and 40 bits per quadword; X4 gets no pre-rotation.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle quadwords inside the register pairs (X0,X1), (X2,X3), (X4,X5) and
	// (X6,X7) with PUNPCKLQDQ/PUNPCKHQDQ (plus a half swap of X5 and X7), then
	// rotate whole registers using X8/X9 as scratch:
	//   X0..X3 <- X2, X3, X6, X7;  X6, X7 <- X4, X5;  X4, X5 <- X0, X1.
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	// Write the eight working vectors back to 16(AX)..143(AX).
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// Unrolled even step (generator labels: msg_exp_even, msg_add_even, mix_even,
	// word_perm). It first updates the two 64-byte stack blocks at 0(SP) and
	// 64(SP), then mixes the eight vectors kept at 16(AX)..143(AX) with them and
	// with the step constants at g_StepConstants<>+1664, and stores the result
	// back to 16(AX)..143(AX).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// Permute the quadwords of X0..X3 (X8/X9 are scratch); PSHUFD $0x4e swaps
	// the two 64-bit halves of a register. Written as {low, high}:
	//   X0 = half-swapped old X1, X1 = old X0,
	//   X2 = {old X3 high, old X2 low}, X3 = {old X2 high, old X3 low}.
	// Then add the block loaded from 128(SP) per 64-bit lane.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The permuted-and-summed block overwrites 0(SP)..63(SP).
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same quadword permutation as above, now on the block from 64(SP), followed
	// by a per-lane add of the block from 192(SP).
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	// The result overwrites 64(SP)..127(SP).
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	// Reload the eight working vectors from 16(AX)..143(AX) into X0..X7.
	// NOTE(review): AX is set up outside this span; presumably it points at the
	// hash context - confirm against the function prologue.
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// XOR the block just written to 0(SP) into X0..X3 (X8..X11 are scratch).
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// XOR the block just written to 64(SP) into X4..X7.
	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Load this step's 64 bytes of constants into X8..X11 (aligned loads).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1664(SB), X8
	MOVOA g_StepConstants<>+1680(SB), X9
	MOVOA g_StepConstants<>+1696(SB), X10
	MOVOA g_StepConstants<>+1712(SB), X11

	// mix_even
	// add_blk
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every quadword of X0..X3 left by 23 bits: (x << 0x17) | (x >> 0x29),
	// 23 + 41 = 64. X12 is scratch. Afterwards XOR in the step constants held
	// in X8..X11.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	// X4..X7 += X0..X3.
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every quadword of X4..X7 left by 59 bits: (x << 0x3b) | (x >> 0x05).
	// X8 is reused as scratch; the step constant it held was consumed above.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	// X0..X3 += X4..X7.
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Per-register bit rotations of X4..X7, built from SSE2 shifts and two masks
	// at g_BytePermInfo_sse2<>+0 and +16 (mask values are defined outside this
	// span). In each group the part selected by the +0 mask is rotated left by
	// 16 bits within its quadword ((m << 16) ^ (m >> 48)) and XORed into the
	// part kept by the +16 mask. X5, X6 and X7 are additionally pre-rotated left
	// by 32, 8 and 40 bits per quadword; X4 gets no pre-rotation.
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	// Shuffle quadwords inside the register pairs (X0,X1), (X2,X3), (X4,X5) and
	// (X6,X7) with PUNPCKLQDQ/PUNPCKHQDQ (plus a half swap of X5 and X7), then
	// rotate whole registers using X8/X9 as scratch:
	//   X0..X3 <- X2, X3, X6, X7;  X6, X7 <- X4, X5;  X4, X5 <- X0, X1.
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	// Write the eight working vectors back to 16(AX)..143(AX).
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1728(SB), X8
	MOVOA g_StepConstants<>+1744(SB), X9
	MOVOA g_StepConstants<>+1760(SB), X10
	MOVOA g_StepConstants<>+1776(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	MOVOA X4, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X4
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X4
	MOVOA X5, X9
	PSLLQ $+32, X9
	PSRLQ $+32, X5
	PXOR  X9, X5
	MOVOA X5, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X5
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X5
	MOVOA X6, X9
	PSLLQ $+8, X9
	PSRLQ $+56, X6
	PXOR  X9, X6
	MOVOA X6, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X6
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X6
	MOVOA X7, X9
	PSLLQ $+40, X9
	PSRLQ $+24, X7
	PXOR  X9, X7
	MOVOA X7, X9
	PAND  g_BytePermInfo_sse2<>+0(SB), X9
	PAND  g_BytePermInfo_sse2<>+16(SB), X7
	MOVOA X9, X8
	PSLLQ $+16, X8
	PSRLQ $+48, X9
	PXOR  X8, X9
	PXOR  X9, X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// fin
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	// get_hash
	// store_blk
	MOVOU X0, (DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	MOVQ  ctx+0(FP), AX
	MOVQ  CX, 8(AX)
	RET

// memset_value_0 is a file-local (<>), read-only, pointer-free block of
// 32 zero bytes, laid out as four consecutive 8-byte words.
// NOTE(review): the name suggests it is the fill source for memset-style
// clears elsewhere in this file; confirm against the generator.
DATA memset_value_0<>+0(SB)/8, $0
DATA memset_value_0<>+8(SB)/8, $0
DATA memset_value_0<>+16(SB)/8, $0
DATA memset_value_0<>+24(SB)/8, $0
GLOBL memset_value_0<>(SB), RODATA|NOPTR, $32

// func lsh512UpdateSSSE3(ctx *lsh512ContextAsmData, data []byte)
// Requires: SSE2, SSSE3
TEXT ·lsh512UpdateSSSE3(SB), NOSPLIT, $256-32
	MOVQ ctx+0(FP), AX
	MOVL (AX), CX
	MOVQ 8(AX), CX
	MOVQ data_base+8(FP), DX
	MOVQ data_len+16(FP), BX

	// lsh512_ssse3_update
	MOVQ CX, SI
	MOVQ BX, DI
	ADDQ SI, DI
	CMPQ DI, $0x00000100
	JGE  lsh512_ssse3_update_if0_end

	// Memcpy
	LEAQ 144(AX)(SI*1), AX
	LEAQ (DX), DI
	MOVQ BX, R8

memcpy_4_sz16_start:
	CMPQ  R8, $0x00000010
	JL    memcpy_4_sz16_end
	MOVOU (DI), X0
	MOVOU X0, (AX)
	ADDQ  $0x00000010, DI
	ADDQ  $0x00000010, AX
	SUBQ  $0x00000010, R8
	JMP   memcpy_4_sz16_start

memcpy_4_sz16_end:
memcpy_4_sz8_start:
	CMPQ R8, $0x00000008
	JL   memcpy_4_sz8_end
	MOVQ (DI), DX
	MOVQ DX, (AX)
	ADDQ $0x00000008, DI
	ADDQ $0x00000008, AX
	SUBQ $0x00000008, R8
	JMP  memcpy_4_sz8_start

memcpy_4_sz8_end:
memcpy_4_sz4_start:
	CMPQ R8, $0x00000004
	JL   memcpy_4_sz4_end
	MOVL (DI), DX
	MOVL DX, (AX)
	ADDQ $0x00000004, DI
	ADDQ $0x00000004, AX
	SUBQ $0x00000004, R8
	JMP  memcpy_4_sz4_start

memcpy_4_sz4_end:
memcpy_4_sz2_start:
	CMPQ R8, $0x00000002
	JL   memcpy_4_sz2_end
	MOVW (DI), DX
	MOVW DX, (AX)
	ADDQ $0x00000002, DI
	ADDQ $0x00000002, AX
	SUBQ $0x00000002, R8
	JMP  memcpy_4_sz2_start

memcpy_4_sz2_end:
memcpy_4_sz1_start:
	CMPQ R8, $0x00000001
	JL   memcpy_4_sz1_end
	MOVB (DI), DL
	MOVB DL, (AX)
	ADDQ $0x00000001, DI
	ADDQ $0x00000001, AX
	SUBQ $0x00000001, R8
	JMP  memcpy_4_sz1_start

memcpy_4_sz1_end:
	ADDQ BX, CX
	ADDQ BX, SI
	JMP  lsh512_ssse3_update_ret

lsh512_ssse3_update_if0_end:
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7
	CMPQ  SI, $0x00000000
	JE    lsh512_ssse3_update_if1_end
	MOVQ  $0x00000100, CX
	SUBQ  SI, CX

	// Memcpy
	LEAQ 144(AX)(SI*1), DI
	LEAQ (DX), R8
	MOVQ CX, R9

memcpy_5_sz16_start:
	CMPQ  R9, $0x00000010
	JL    memcpy_5_sz16_end
	MOVOU (R8), X8
	MOVOU X8, (DI)
	ADDQ  $0x00000010, R8
	ADDQ  $0x00000010, DI
	SUBQ  $0x00000010, R9
	JMP   memcpy_5_sz16_start

memcpy_5_sz16_end:
memcpy_5_sz8_start:
	CMPQ R9, $0x00000008
	JL   memcpy_5_sz8_end
	MOVQ (R8), SI
	MOVQ SI, (DI)
	ADDQ $0x00000008, R8
	ADDQ $0x00000008, DI
	SUBQ $0x00000008, R9
	JMP  memcpy_5_sz8_start

memcpy_5_sz8_end:
memcpy_5_sz4_start:
	CMPQ R9, $0x00000004
	JL   memcpy_5_sz4_end
	MOVL (R8), SI
	MOVL SI, (DI)
	ADDQ $0x00000004, R8
	ADDQ $0x00000004, DI
	SUBQ $0x00000004, R9
	JMP  memcpy_5_sz4_start

memcpy_5_sz4_end:
memcpy_5_sz2_start:
	CMPQ R9, $0x00000002
	JL   memcpy_5_sz2_end
	MOVW (R8), SI
	MOVW SI, (DI)
	ADDQ $0x00000002, R8
	ADDQ $0x00000002, DI
	SUBQ $0x00000002, R9
	JMP  memcpy_5_sz2_start

memcpy_5_sz2_end:
memcpy_5_sz1_start:
	CMPQ R9, $0x00000001
	JL   memcpy_5_sz1_end
	MOVB (R8), SI
	MOVB SI, (DI)
	ADDQ $0x00000001, R8
	ADDQ $0x00000001, DI
	SUBQ $0x00000001, R9
	JMP  memcpy_5_sz1_start

memcpy_5_sz1_end:
	// compress
	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 144(AX), X8
	MOVOU X8, (SP)
	MOVOU 160(AX), X8
	MOVOU X8, 16(SP)
	MOVOU 176(AX), X8
	MOVOU X8, 32(SP)
	MOVOU 192(AX), X8
	MOVOU X8, 48(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 208(AX), X8
	MOVOU X8, 64(SP)
	MOVOU 224(AX), X8
	MOVOU X8, 80(SP)
	MOVOU 240(AX), X8
	MOVOU X8, 96(SP)
	MOVOU 256(AX), X8
	MOVOU X8, 112(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 272(AX), X8
	MOVOU X8, 128(SP)
	MOVOU 288(AX), X8
	MOVOU X8, 144(SP)
	MOVOU 304(AX), X8
	MOVOU X8, 160(SP)
	MOVOU 320(AX), X8
	MOVOU X8, 176(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 336(AX), X8
	MOVOU X8, 192(SP)
	MOVOU 352(AX), X8
	MOVOU X8, 208(SP)
	MOVOU 368(AX), X8
	MOVOU X8, 224(SP)
	MOVOU 384(AX), X8
	MOVOU X8, 240(SP)

	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+0(SB), X8
	MOVOA g_StepConstants<>+16(SB), X9
	MOVOA g_StepConstants<>+32(SB), X10
	MOVOA g_StepConstants<>+48(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+64(SB), X8
	MOVOA g_StepConstants<>+80(SB), X9
	MOVOA g_StepConstants<>+96(SB), X10
	MOVOA g_StepConstants<>+112(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+128(SB), X8
	MOVOA g_StepConstants<>+144(SB), X9
	MOVOA g_StepConstants<>+160(SB), X10
	MOVOA g_StepConstants<>+176(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+192(SB), X8
	MOVOA g_StepConstants<>+208(SB), X9
	MOVOA g_StepConstants<>+224(SB), X10
	MOVOA g_StepConstants<>+240(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+256(SB), X8
	MOVOA g_StepConstants<>+272(SB), X9
	MOVOA g_StepConstants<>+288(SB), X10
	MOVOA g_StepConstants<>+304(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+320(SB), X8
	MOVOA g_StepConstants<>+336(SB), X9
	MOVOA g_StepConstants<>+352(SB), X10
	MOVOA g_StepConstants<>+368(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+384(SB), X8
	MOVOA g_StepConstants<>+400(SB), X9
	MOVOA g_StepConstants<>+416(SB), X10
	MOVOA g_StepConstants<>+432(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+448(SB), X8
	MOVOA g_StepConstants<>+464(SB), X9
	MOVOA g_StepConstants<>+480(SB), X10
	MOVOA g_StepConstants<>+496(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+512(SB), X8
	MOVOA g_StepConstants<>+528(SB), X9
	MOVOA g_StepConstants<>+544(SB), X10
	MOVOA g_StepConstants<>+560(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+576(SB), X8
	MOVOA g_StepConstants<>+592(SB), X9
	MOVOA g_StepConstants<>+608(SB), X10
	MOVOA g_StepConstants<>+624(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+640(SB), X8
	MOVOA g_StepConstants<>+656(SB), X9
	MOVOA g_StepConstants<>+672(SB), X10
	MOVOA g_StepConstants<>+688(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+704(SB), X8
	MOVOA g_StepConstants<>+720(SB), X9
	MOVOA g_StepConstants<>+736(SB), X10
	MOVOA g_StepConstants<>+752(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+768(SB), X8
	MOVOA g_StepConstants<>+784(SB), X9
	MOVOA g_StepConstants<>+800(SB), X10
	MOVOA g_StepConstants<>+816(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+832(SB), X8
	MOVOA g_StepConstants<>+848(SB), X9
	MOVOA g_StepConstants<>+864(SB), X10
	MOVOA g_StepConstants<>+880(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+896(SB), X8
	MOVOA g_StepConstants<>+912(SB), X9
	MOVOA g_StepConstants<>+928(SB), X10
	MOVOA g_StepConstants<>+944(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+960(SB), X8
	MOVOA g_StepConstants<>+976(SB), X9
	MOVOA g_StepConstants<>+992(SB), X10
	MOVOA g_StepConstants<>+1008(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1024(SB), X8
	MOVOA g_StepConstants<>+1040(SB), X9
	MOVOA g_StepConstants<>+1056(SB), X10
	MOVOA g_StepConstants<>+1072(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1088(SB), X8
	MOVOA g_StepConstants<>+1104(SB), X9
	MOVOA g_StepConstants<>+1120(SB), X10
	MOVOA g_StepConstants<>+1136(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1152(SB), X8
	MOVOA g_StepConstants<>+1168(SB), X9
	MOVOA g_StepConstants<>+1184(SB), X10
	MOVOA g_StepConstants<>+1200(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1216(SB), X8
	MOVOA g_StepConstants<>+1232(SB), X9
	MOVOA g_StepConstants<>+1248(SB), X10
	MOVOA g_StepConstants<>+1264(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1280(SB), X8
	MOVOA g_StepConstants<>+1296(SB), X9
	MOVOA g_StepConstants<>+1312(SB), X10
	MOVOA g_StepConstants<>+1328(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Fetch 64 bytes of g_StepConstants starting at byte offset 1344
	// (= 21 * 64) into X8..X11; they are XORed into X0..X3 after the alpha
	// rotation below.
	// MOVOA is an aligned load, so this relies on g_StepConstants being
	// 16-byte aligned (its definition is outside this chunk).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1344(SB), X8
	MOVOA g_StepConstants<>+1360(SB), X9
	MOVOA g_StepConstants<>+1376(SB), X10
	MOVOA g_StepConstants<>+1392(SB), X11

	// mix_odd
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	// Rotate every 64-bit lane of X0..X3 left by 7:
	// (x << 7) | (x >> 57), with X12 as scratch for the shifted-left copy.
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	// XOR in the step constants loaded above; X8..X11 are dead afterwards.
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3.
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	// Rotate every 64-bit lane of X4..X7 left by 3:
	// (x << 3) | (x >> 61). X8 is reused as scratch now that the step
	// constants have been consumed.
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// X0..X3 += X4..X7.
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Byte-shuffle X4..X7, each with its own 16-byte mask taken from
	// g_BytePermInfo_ssse3 (+0, +16, +32, +48).
	// NOTE(review): the mask contents are defined outside this chunk;
	// presumably they encode per-word byte rotations - confirm against the table.
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	// Permutes the sixteen 64-bit words held in X0..X7, using X8/X9 as scratch.
	// Writing each register as [low qword, high qword], with the incoming
	// words named a0..a7 (X0..X3) and b0..b7 (X4..X7), the result is:
	//   X0 = [a6, a4]   X1 = [a5, a7]
	//   X2 = [b4, b7]   X3 = [b6, b5]
	//   X4 = [a2, a0]   X5 = [a1, a3]
	//   X6 = [b0, b3]   X7 = [b2, b1]

	// X0/X1: [a0,a1],[a2,a3] -> [a2,a0],[a1,a3].
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	// X2/X3: [a4,a5],[a6,a7] -> [a6,a4],[a5,a7].
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	// X4/X5: PSHUFD $0x4e swaps the two qwords of X5, then
	// [b0,b1],[b3,b2] -> [b0,b3],[b2,b1].
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	// X6/X7: same pattern, [b4,b5],[b6,b7] -> [b4,b7],[b6,b5].
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	// Rotate the four register pairs through X8/X9:
	// (X2,X3) -> (X0,X1), (X6,X7) -> (X2,X3),
	// (X4,X5) -> (X6,X7), old (X0,X1) -> (X4,X5).
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// Spill the eight working vectors X0..X7 back to 16(AX)..143(AX), the
	// same 128-byte region the load___start sequences read from. This frees
	// X0..X7 for the message-expansion code that follows.
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)
	// save___end
	// msg_exp_even
	// Updates the 128-byte stack buffer at 0(SP)..127(SP) in place from
	// itself and the buffer at 128(SP)..255(SP). Each 64-byte half is handled
	// separately: the eight 64-bit words w0..w7 of the half being updated are
	// permuted to [w3, w2, w0, w1, w7, w4, w5, w6] and the corresponding half
	// of the other buffer is added lane-wise.
	// The working vectors were spilled to (AX) just before, so X0..X9 are
	// free to clobber here.

	// First half: X0..X3 = 0(SP)..63(SP), X4..X7 = 128(SP)..191(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	// PSHUFD $0x4e swaps the two qwords of a register.
	// X0/X1: [w0,w1],[w2,w3] -> [w3,w2],[w0,w1].
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	// X2/X3: [w4,w5],[w6,w7] -> [w7,w4],[w5,w6].
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	// Add the matching half of the other buffer (64-bit lanes).
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// Write the result back over 0(SP)..63(SP).
	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// Second half: X0..X3 = 64(SP)..127(SP), X4..X7 = 192(SP)..255(SP).
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	// Same permutation and add as for the first half.
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// Write the result back over 64(SP)..127(SP).
	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// Fetch 64 bytes of g_StepConstants starting at byte offset 1408
	// (= 22 * 64) into X8..X11; they are XORed into X0..X3 after the alpha
	// rotation below.
	// MOVOA is an aligned load, so this relies on g_StepConstants being
	// 16-byte aligned (its definition is outside this chunk).
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1408(SB), X8
	MOVOA g_StepConstants<>+1424(SB), X9
	MOVOA g_StepConstants<>+1440(SB), X10
	MOVOA g_StepConstants<>+1456(SB), X11

	// mix_even
	// Same structure as mix_odd; only the rotation amounts differ
	// (23 and 59 here versus 7 and 3 in the odd variant).
	// X0..X3 += X4..X7 (independent 64-bit lanes).
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	// Rotate every 64-bit lane of X0..X3 left by 23:
	// (x << 23) | (x >> 41), with X12 as scratch for the shifted-left copy.
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	// XOR in the step constants loaded above; X8..X11 are dead afterwards.
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// X4..X7 += X0..X3.
	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	// Rotate every 64-bit lane of X4..X7 left by 59:
	// (x << 59) | (x >> 5). X8 is reused as scratch now that the step
	// constants have been consumed.
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// X0..X3 += X4..X7.
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	// Byte-shuffle X4..X7, each with its own 16-byte mask taken from
	// g_BytePermInfo_ssse3 (+0, +16, +32, +48).
	// NOTE(review): the mask contents are defined outside this chunk;
	// presumably they encode per-word byte rotations - confirm against the table.
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1472(SB), X8
	MOVOA g_StepConstants<>+1488(SB), X9
	MOVOA g_StepConstants<>+1504(SB), X10
	MOVOA g_StepConstants<>+1520(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1536(SB), X8
	MOVOA g_StepConstants<>+1552(SB), X9
	MOVOA g_StepConstants<>+1568(SB), X10
	MOVOA g_StepConstants<>+1584(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1600(SB), X8
	MOVOA g_StepConstants<>+1616(SB), X9
	MOVOA g_StepConstants<>+1632(SB), X10
	MOVOA g_StepConstants<>+1648(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1664(SB), X8
	MOVOA g_StepConstants<>+1680(SB), X9
	MOVOA g_StepConstants<>+1696(SB), X10
	MOVOA g_StepConstants<>+1712(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1728(SB), X8
	MOVOA g_StepConstants<>+1744(SB), X9
	MOVOA g_StepConstants<>+1760(SB), X10
	MOVOA g_StepConstants<>+1776(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7
	// Account for the CX bytes just consumed: advance DX by CX, shrink BX by
	// CX, then clear SI and CX.
	// NOTE(review): DX is presumably the input cursor, BX the number of input
	// bytes still to process and SI/CX buffered-byte bookkeeping; their setup
	// is outside this chunk - confirm.
	ADDQ  CX, DX
	SUBQ  CX, BX
	MOVQ  $0x00000000, SI
	MOVQ  $0x00000000, CX

lsh512_ssse3_update_if1_end:
lsh512_ssse3_update_while_start:
	// Loop guard: leave the loop once BX < 0x100 (256). JL is a signed
	// comparison, so a negative BX also exits.
	CMPQ BX, $0x00000100
	JL   lsh512_ssse3_update_while_end

	// compress
	// Copy one 256-byte block from (DX) to 0(SP)..255(SP), 16 bytes at a time
	// through X8. Unaligned moves (MOVOU) are used on both sides, so no
	// alignment is required of DX or SP. The copy goes through X8 only, so
	// X0..X7 are left untouched by it.
	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU (DX), X8
	MOVOU X8, (SP)
	MOVOU 16(DX), X8
	MOVOU X8, 16(SP)
	MOVOU 32(DX), X8
	MOVOU X8, 32(SP)
	MOVOU 48(DX), X8
	MOVOU X8, 48(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 64(DX), X8
	MOVOU X8, 64(SP)
	MOVOU 80(DX), X8
	MOVOU X8, 80(SP)
	MOVOU 96(DX), X8
	MOVOU X8, 96(SP)
	MOVOU 112(DX), X8
	MOVOU X8, 112(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 128(DX), X8
	MOVOU X8, 128(SP)
	MOVOU 144(DX), X8
	MOVOU X8, 144(SP)
	MOVOU 160(DX), X8
	MOVOU X8, 160(SP)
	MOVOU 176(DX), X8
	MOVOU X8, 176(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 192(DX), X8
	MOVOU X8, 192(SP)
	MOVOU 208(DX), X8
	MOVOU X8, 208(SP)
	MOVOU 224(DX), X8
	MOVOU X8, 224(SP)
	MOVOU 240(DX), X8
	MOVOU X8, 240(SP)

	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+0(SB), X8
	MOVOA g_StepConstants<>+16(SB), X9
	MOVOA g_StepConstants<>+32(SB), X10
	MOVOA g_StepConstants<>+48(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+64(SB), X8
	MOVOA g_StepConstants<>+80(SB), X9
	MOVOA g_StepConstants<>+96(SB), X10
	MOVOA g_StepConstants<>+112(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+128(SB), X8
	MOVOA g_StepConstants<>+144(SB), X9
	MOVOA g_StepConstants<>+160(SB), X10
	MOVOA g_StepConstants<>+176(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+192(SB), X8
	MOVOA g_StepConstants<>+208(SB), X9
	MOVOA g_StepConstants<>+224(SB), X10
	MOVOA g_StepConstants<>+240(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+256(SB), X8
	MOVOA g_StepConstants<>+272(SB), X9
	MOVOA g_StepConstants<>+288(SB), X10
	MOVOA g_StepConstants<>+304(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+320(SB), X8
	MOVOA g_StepConstants<>+336(SB), X9
	MOVOA g_StepConstants<>+352(SB), X10
	MOVOA g_StepConstants<>+368(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+384(SB), X8
	MOVOA g_StepConstants<>+400(SB), X9
	MOVOA g_StepConstants<>+416(SB), X10
	MOVOA g_StepConstants<>+432(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+448(SB), X8
	MOVOA g_StepConstants<>+464(SB), X9
	MOVOA g_StepConstants<>+480(SB), X10
	MOVOA g_StepConstants<>+496(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+512(SB), X8
	MOVOA g_StepConstants<>+528(SB), X9
	MOVOA g_StepConstants<>+544(SB), X10
	MOVOA g_StepConstants<>+560(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+576(SB), X8
	MOVOA g_StepConstants<>+592(SB), X9
	MOVOA g_StepConstants<>+608(SB), X10
	MOVOA g_StepConstants<>+624(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+640(SB), X8
	MOVOA g_StepConstants<>+656(SB), X9
	MOVOA g_StepConstants<>+672(SB), X10
	MOVOA g_StepConstants<>+688(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+704(SB), X8
	MOVOA g_StepConstants<>+720(SB), X9
	MOVOA g_StepConstants<>+736(SB), X10
	MOVOA g_StepConstants<>+752(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+768(SB), X8
	MOVOA g_StepConstants<>+784(SB), X9
	MOVOA g_StepConstants<>+800(SB), X10
	MOVOA g_StepConstants<>+816(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+832(SB), X8
	MOVOA g_StepConstants<>+848(SB), X9
	MOVOA g_StepConstants<>+864(SB), X10
	MOVOA g_StepConstants<>+880(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+896(SB), X8
	MOVOA g_StepConstants<>+912(SB), X9
	MOVOA g_StepConstants<>+928(SB), X10
	MOVOA g_StepConstants<>+944(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+960(SB), X8
	MOVOA g_StepConstants<>+976(SB), X9
	MOVOA g_StepConstants<>+992(SB), X10
	MOVOA g_StepConstants<>+1008(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1024(SB), X8
	MOVOA g_StepConstants<>+1040(SB), X9
	MOVOA g_StepConstants<>+1056(SB), X10
	MOVOA g_StepConstants<>+1072(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1088(SB), X8
	MOVOA g_StepConstants<>+1104(SB), X9
	MOVOA g_StepConstants<>+1120(SB), X10
	MOVOA g_StepConstants<>+1136(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1152(SB), X8
	MOVOA g_StepConstants<>+1168(SB), X9
	MOVOA g_StepConstants<>+1184(SB), X10
	MOVOA g_StepConstants<>+1200(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1216(SB), X8
	MOVOA g_StepConstants<>+1232(SB), X9
	MOVOA g_StepConstants<>+1248(SB), X10
	MOVOA g_StepConstants<>+1264(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1280(SB), X8
	MOVOA g_StepConstants<>+1296(SB), X9
	MOVOA g_StepConstants<>+1312(SB), X10
	MOVOA g_StepConstants<>+1328(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1344(SB), X8
	MOVOA g_StepConstants<>+1360(SB), X9
	MOVOA g_StepConstants<>+1376(SB), X10
	MOVOA g_StepConstants<>+1392(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1408(SB), X8
	MOVOA g_StepConstants<>+1424(SB), X9
	MOVOA g_StepConstants<>+1440(SB), X10
	MOVOA g_StepConstants<>+1456(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1472(SB), X8
	MOVOA g_StepConstants<>+1488(SB), X9
	MOVOA g_StepConstants<>+1504(SB), X10
	MOVOA g_StepConstants<>+1520(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1536(SB), X8
	MOVOA g_StepConstants<>+1552(SB), X9
	MOVOA g_StepConstants<>+1568(SB), X10
	MOVOA g_StepConstants<>+1584(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1600(SB), X8
	MOVOA g_StepConstants<>+1616(SB), X9
	MOVOA g_StepConstants<>+1632(SB), X10
	MOVOA g_StepConstants<>+1648(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1664(SB), X8
	MOVOA g_StepConstants<>+1680(SB), X9
	MOVOA g_StepConstants<>+1696(SB), X10
	MOVOA g_StepConstants<>+1712(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1728(SB), X8
	MOVOA g_StepConstants<>+1744(SB), X9
	MOVOA g_StepConstants<>+1760(SB), X10
	MOVOA g_StepConstants<>+1776(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7
	ADDQ  $0x00000100, DX
	SUBQ  $0x00000100, BX
	JMP   lsh512_ssse3_update_while_start

lsh512_ssse3_update_while_end:
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)
	CMPQ  BX, $0x00000000
	JE    lsh512_ssse3_update_if3_end

	// Memcpy
	LEAQ 144(AX), AX
	LEAQ (DX), DX
	MOVQ BX, SI

memcpy_6_sz16_start:
	CMPQ  SI, $0x00000010
	JL    memcpy_6_sz16_end
	MOVOU (DX), X0
	MOVOU X0, (AX)
	ADDQ  $0x00000010, DX
	ADDQ  $0x00000010, AX
	SUBQ  $0x00000010, SI
	JMP   memcpy_6_sz16_start

memcpy_6_sz16_end:
memcpy_6_sz8_start:
	CMPQ SI, $0x00000008
	JL   memcpy_6_sz8_end
	MOVQ (DX), CX
	MOVQ CX, (AX)
	ADDQ $0x00000008, DX
	ADDQ $0x00000008, AX
	SUBQ $0x00000008, SI
	JMP  memcpy_6_sz8_start

memcpy_6_sz8_end:
memcpy_6_sz4_start:
	CMPQ SI, $0x00000004
	JL   memcpy_6_sz4_end
	MOVL (DX), CX
	MOVL CX, (AX)
	ADDQ $0x00000004, DX
	ADDQ $0x00000004, AX
	SUBQ $0x00000004, SI
	JMP  memcpy_6_sz4_start

memcpy_6_sz4_end:
memcpy_6_sz2_start:
	CMPQ SI, $0x00000002
	JL   memcpy_6_sz2_end
	MOVW (DX), CX
	MOVW CX, (AX)
	ADDQ $0x00000002, DX
	ADDQ $0x00000002, AX
	SUBQ $0x00000002, SI
	JMP  memcpy_6_sz2_start

memcpy_6_sz2_end:
memcpy_6_sz1_start:
	CMPQ SI, $0x00000001
	JL   memcpy_6_sz1_end
	MOVB (DX), CL
	MOVB CL, (AX)
	ADDQ $0x00000001, DX
	ADDQ $0x00000001, AX
	SUBQ $0x00000001, SI
	JMP  memcpy_6_sz1_start

memcpy_6_sz1_end:
	MOVQ BX, CX

lsh512_ssse3_update_if3_end:
lsh512_ssse3_update_ret:
	MOVQ ctx+0(FP), AX
	MOVQ CX, 8(AX)
	RET

// func lsh512FinalSSSE3(ctx *lsh512ContextAsmData, hashval []byte)
// Requires: SSE2, SSSE3
TEXT ·lsh512FinalSSSE3(SB), NOSPLIT, $256-32
	MOVQ ctx+0(FP), AX
	MOVL (AX), CX
	MOVQ 8(AX), CX
	MOVQ hashval_base+8(FP), DX

	// lsh512_ssse3_final
	MOVQ CX, BX
	MOVB $0x80, 144(AX)(BX*1)
	MOVQ $0x000000ff, SI
	SUBQ BX, SI

	// memset
	LEAQ 145(AX)(BX*1), BX
	CMPQ SI, $0x00000010
	JL   memset_2_sz16_end
	MOVO memset_value_0<>+0(SB), X0

memset_2_sz16_start:
	MOVOU X0, (BX)
	SUBQ  $0x00000010, SI
	ADDQ  $0x00000010, BX
	CMPQ  SI, $0x00000010
	JL    memset_2_sz16_end
	JMP   memset_2_sz16_start

memset_2_sz16_end:
	CMPQ SI, $0x00000008
	JL   memset_2_sz8_end
	MOVQ memset_value_0<>+0(SB), DI

memset_2_sz8_start:
	MOVQ DI, (BX)
	SUBQ $0x00000008, SI
	ADDQ $0x00000008, BX
	CMPQ SI, $0x00000008
	JL   memset_2_sz8_end
	JMP  memset_2_sz8_start

memset_2_sz8_end:
	CMPQ SI, $0x00000004
	JL   memset_2_sz4_end
	MOVL memset_value_0<>+0(SB), DI

memset_2_sz4_start:
	MOVL DI, (BX)
	SUBQ $0x00000004, SI
	ADDQ $0x00000004, BX
	CMPQ SI, $0x00000004
	JL   memset_2_sz4_end
	JMP  memset_2_sz4_start

memset_2_sz4_end:
	CMPQ SI, $0x00000002
	JL   memset_2_sz2_end
	MOVW memset_value_0<>+0(SB), DI

memset_2_sz2_start:
	MOVW DI, (BX)
	SUBQ $0x00000002, SI
	ADDQ $0x00000002, BX
	CMPQ SI, $0x00000002
	JL   memset_2_sz2_end
	JMP  memset_2_sz2_start

memset_2_sz2_end:
memset_2_1_start:
	CMPQ SI, $0x00000000
	JE   memset_2_1_end
	MOVB $0x00, (BX)
	SUBQ $0x00000001, SI
	ADDQ $0x00000001, BX
	JMP  memset_2_1_start

memset_2_1_end:
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// compress
	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 144(AX), X8
	MOVOU X8, (SP)
	MOVOU 160(AX), X8
	MOVOU X8, 16(SP)
	MOVOU 176(AX), X8
	MOVOU X8, 32(SP)
	MOVOU 192(AX), X8
	MOVOU X8, 48(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 208(AX), X8
	MOVOU X8, 64(SP)
	MOVOU 224(AX), X8
	MOVOU X8, 80(SP)
	MOVOU 240(AX), X8
	MOVOU X8, 96(SP)
	MOVOU 256(AX), X8
	MOVOU X8, 112(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 272(AX), X8
	MOVOU X8, 128(SP)
	MOVOU 288(AX), X8
	MOVOU X8, 144(SP)
	MOVOU 304(AX), X8
	MOVOU X8, 160(SP)
	MOVOU 320(AX), X8
	MOVOU X8, 176(SP)

	// load_blk_mem2mem
	// MemcpyStatic
	MOVOU 336(AX), X8
	MOVOU X8, 192(SP)
	MOVOU 352(AX), X8
	MOVOU X8, 208(SP)
	MOVOU 368(AX), X8
	MOVOU X8, 224(SP)
	MOVOU 384(AX), X8
	MOVOU X8, 240(SP)

	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+0(SB), X8
	MOVOA g_StepConstants<>+16(SB), X9
	MOVOA g_StepConstants<>+32(SB), X10
	MOVOA g_StepConstants<>+48(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+64(SB), X8
	MOVOA g_StepConstants<>+80(SB), X9
	MOVOA g_StepConstants<>+96(SB), X10
	MOVOA g_StepConstants<>+112(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+128(SB), X8
	MOVOA g_StepConstants<>+144(SB), X9
	MOVOA g_StepConstants<>+160(SB), X10
	MOVOA g_StepConstants<>+176(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+192(SB), X8
	MOVOA g_StepConstants<>+208(SB), X9
	MOVOA g_StepConstants<>+224(SB), X10
	MOVOA g_StepConstants<>+240(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+256(SB), X8
	MOVOA g_StepConstants<>+272(SB), X9
	MOVOA g_StepConstants<>+288(SB), X10
	MOVOA g_StepConstants<>+304(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+320(SB), X8
	MOVOA g_StepConstants<>+336(SB), X9
	MOVOA g_StepConstants<>+352(SB), X10
	MOVOA g_StepConstants<>+368(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+384(SB), X8
	MOVOA g_StepConstants<>+400(SB), X9
	MOVOA g_StepConstants<>+416(SB), X10
	MOVOA g_StepConstants<>+432(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+448(SB), X8
	MOVOA g_StepConstants<>+464(SB), X9
	MOVOA g_StepConstants<>+480(SB), X10
	MOVOA g_StepConstants<>+496(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+512(SB), X8
	MOVOA g_StepConstants<>+528(SB), X9
	MOVOA g_StepConstants<>+544(SB), X10
	MOVOA g_StepConstants<>+560(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+576(SB), X8
	MOVOA g_StepConstants<>+592(SB), X9
	MOVOA g_StepConstants<>+608(SB), X10
	MOVOA g_StepConstants<>+624(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+640(SB), X8
	MOVOA g_StepConstants<>+656(SB), X9
	MOVOA g_StepConstants<>+672(SB), X10
	MOVOA g_StepConstants<>+688(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+704(SB), X8
	MOVOA g_StepConstants<>+720(SB), X9
	MOVOA g_StepConstants<>+736(SB), X10
	MOVOA g_StepConstants<>+752(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+768(SB), X8
	MOVOA g_StepConstants<>+784(SB), X9
	MOVOA g_StepConstants<>+800(SB), X10
	MOVOA g_StepConstants<>+816(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+832(SB), X8
	MOVOA g_StepConstants<>+848(SB), X9
	MOVOA g_StepConstants<>+864(SB), X10
	MOVOA g_StepConstants<>+880(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+896(SB), X8
	MOVOA g_StepConstants<>+912(SB), X9
	MOVOA g_StepConstants<>+928(SB), X10
	MOVOA g_StepConstants<>+944(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+960(SB), X8
	MOVOA g_StepConstants<>+976(SB), X9
	MOVOA g_StepConstants<>+992(SB), X10
	MOVOA g_StepConstants<>+1008(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1024(SB), X8
	MOVOA g_StepConstants<>+1040(SB), X9
	MOVOA g_StepConstants<>+1056(SB), X10
	MOVOA g_StepConstants<>+1072(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1088(SB), X8
	MOVOA g_StepConstants<>+1104(SB), X9
	MOVOA g_StepConstants<>+1120(SB), X10
	MOVOA g_StepConstants<>+1136(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1152(SB), X8
	MOVOA g_StepConstants<>+1168(SB), X9
	MOVOA g_StepConstants<>+1184(SB), X10
	MOVOA g_StepConstants<>+1200(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1216(SB), X8
	MOVOA g_StepConstants<>+1232(SB), X9
	MOVOA g_StepConstants<>+1248(SB), X10
	MOVOA g_StepConstants<>+1264(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1280(SB), X8
	MOVOA g_StepConstants<>+1296(SB), X9
	MOVOA g_StepConstants<>+1312(SB), X10
	MOVOA g_StepConstants<>+1328(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1344(SB), X8
	MOVOA g_StepConstants<>+1360(SB), X9
	MOVOA g_StepConstants<>+1376(SB), X10
	MOVOA g_StepConstants<>+1392(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1408(SB), X8
	MOVOA g_StepConstants<>+1424(SB), X9
	MOVOA g_StepConstants<>+1440(SB), X10
	MOVOA g_StepConstants<>+1456(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1472(SB), X8
	MOVOA g_StepConstants<>+1488(SB), X9
	MOVOA g_StepConstants<>+1504(SB), X10
	MOVOA g_StepConstants<>+1520(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1536(SB), X8
	MOVOA g_StepConstants<>+1552(SB), X9
	MOVOA g_StepConstants<>+1568(SB), X10
	MOVOA g_StepConstants<>+1584(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1600(SB), X8
	MOVOA g_StepConstants<>+1616(SB), X9
	MOVOA g_StepConstants<>+1632(SB), X10
	MOVOA g_StepConstants<>+1648(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1664(SB), X8
	MOVOA g_StepConstants<>+1680(SB), X9
	MOVOA g_StepConstants<>+1696(SB), X10
	MOVOA g_StepConstants<>+1712(SB), X11

	// mix_even
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_even_alpha
	MOVOA X0, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x17, X12
	PSRLQ $0x29, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_even_beta
	MOVOA X4, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x3b, X8
	PSRLQ $0x05, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_odd
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X0
	MOVOU 144(SP), X1
	MOVOU 160(SP), X2
	MOVOU 176(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X4
	MOVOU 16(SP), X5
	MOVOU 32(SP), X6
	MOVOU 48(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 128(SP)
	MOVOU X1, 144(SP)
	MOVOU X2, 160(SP)
	MOVOU X3, 176(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X4
	MOVOU 80(SP), X5
	MOVOU 96(SP), X6
	MOVOU 112(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 192(SP)
	MOVOU X1, 208(SP)
	MOVOU X2, 224(SP)
	MOVOU X3, 240(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_odd
	// load_blk_mem2vec
	MOVOU 128(SP), X8
	MOVOU 144(SP), X9
	MOVOU 160(SP), X10
	MOVOU 176(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 192(SP), X8
	MOVOU 208(SP), X9
	MOVOU 224(SP), X10
	MOVOU 240(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// load_sc
	// load_blk_mem2vec
	MOVOA g_StepConstants<>+1728(SB), X8
	MOVOA g_StepConstants<>+1744(SB), X9
	MOVOA g_StepConstants<>+1760(SB), X10
	MOVOA g_StepConstants<>+1776(SB), X11

	// mix_odd
	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_blk_odd_alpha
	MOVOA X0, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X0
	POR   X12, X0
	MOVOA X1, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X1
	POR   X12, X1
	MOVOA X2, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X2
	POR   X12, X2
	MOVOA X3, X12
	PSLLQ $0x07, X12
	PSRLQ $0x39, X3
	POR   X12, X3
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// add_blk
	PADDQ X0, X4
	PADDQ X1, X5
	PADDQ X2, X6
	PADDQ X3, X7

	// rotate_blk_odd_beta
	MOVOA X4, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X4
	POR   X8, X4
	MOVOA X5, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X5
	POR   X8, X5
	MOVOA X6, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X6
	POR   X8, X6
	MOVOA X7, X8
	PSLLQ $0x03, X8
	PSRLQ $0x3d, X7
	POR   X8, X7

	// add_blk
	PADDQ X4, X0
	PADDQ X5, X1
	PADDQ X6, X2
	PADDQ X7, X3

	// rotate_msg_gamma
	PSHUFB g_BytePermInfo_ssse3<>+0(SB), X4
	PSHUFB g_BytePermInfo_ssse3<>+16(SB), X5
	PSHUFB g_BytePermInfo_ssse3<>+32(SB), X6
	PSHUFB g_BytePermInfo_ssse3<>+48(SB), X7

	// word_perm
	MOVOA      X0, X8
	MOVOA      X0, X9
	MOVOA      X1, X0
	PUNPCKLQDQ X9, X0
	MOVOA      X1, X9
	MOVOA      X8, X1
	PUNPCKHQDQ X9, X1
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PSHUFD     $0x4e, X5, X5
	MOVOA      X4, X8
	PUNPCKLQDQ X5, X4
	PUNPCKHQDQ X8, X5
	PSHUFD     $0x4e, X7, X7
	MOVOA      X6, X8
	PUNPCKLQDQ X7, X6
	PUNPCKHQDQ X8, X7
	MOVOA      X0, X8
	MOVOA      X1, X9
	MOVOA      X2, X0
	MOVOA      X3, X1
	MOVOA      X6, X2
	MOVOA      X7, X3
	MOVOA      X4, X6
	MOVOA      X5, X7
	MOVOA      X8, X4
	MOVOA      X9, X5

	// save___start
	// store_blk
	MOVOU X0, 16(AX)
	MOVOU X1, 32(AX)
	MOVOU X2, 48(AX)
	MOVOU X3, 64(AX)

	// store_blk
	MOVOU X4, 80(AX)
	MOVOU X5, 96(AX)
	MOVOU X6, 112(AX)
	MOVOU X7, 128(AX)

	// save___end
	// msg_exp_even
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU (SP), X0
	MOVOU 16(SP), X1
	MOVOU 32(SP), X2
	MOVOU 48(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 128(SP), X4
	MOVOU 144(SP), X5
	MOVOU 160(SP), X6
	MOVOU 176(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)

	// i_state_save___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 64(SP), X0
	MOVOU 80(SP), X1
	MOVOU 96(SP), X2
	MOVOU 112(SP), X3

	// i_state_load___end
	// i_state_load___start
	// load_blk_mem2vec
	MOVOU 192(SP), X4
	MOVOU 208(SP), X5
	MOVOU 224(SP), X6
	MOVOU 240(SP), X7

	// i_state_load___end
	PSHUFD     $0x4e, X1, X1
	MOVOA      X0, X8
	MOVOA      X1, X0
	MOVOA      X8, X1
	PSHUFD     $0x4e, X3, X3
	MOVOA      X2, X8
	MOVOA      X2, X9
	MOVOA      X3, X2
	PUNPCKLQDQ X9, X2
	MOVOA      X3, X9
	MOVOA      X8, X3
	PUNPCKHQDQ X9, X3
	PADDQ      X4, X0
	PADDQ      X5, X1
	PADDQ      X6, X2
	PADDQ      X7, X3

	// i_state_save___start
	// store_blk
	MOVOU X0, 64(SP)
	MOVOU X1, 80(SP)
	MOVOU X2, 96(SP)
	MOVOU X3, 112(SP)

	// i_state_save___end
	// load___start
	// load_blk_mem2vec
	MOVOU 16(AX), X0
	MOVOU 32(AX), X1
	MOVOU 48(AX), X2
	MOVOU 64(AX), X3

	// load_blk_mem2vec
	MOVOU 80(AX), X4
	MOVOU 96(AX), X5
	MOVOU 112(AX), X6
	MOVOU 128(AX), X7

	// load___end
	// msg_add_even
	// load_blk_mem2vec
	MOVOU (SP), X8
	MOVOU 16(SP), X9
	MOVOU 32(SP), X10
	MOVOU 48(SP), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	// load_blk_mem2vec
	MOVOU 64(SP), X8
	MOVOU 80(SP), X9
	MOVOU 96(SP), X10
	MOVOU 112(SP), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	// fin
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	// get_hash
	// store_blk
	MOVOU X0, (DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	MOVQ  ctx+0(FP), AX
	MOVQ  CX, 8(AX)
	RET

// func lsh512InitAVX2(ctx *lsh512ContextAsmData)
// Requires: AVX
//
// Copies one 128-byte initial-value table into ctx at byte offsets 16..143.
// The table is chosen by the 32-bit value stored at offset 0 of ctx:
//   0x40 -> g_IV512
//   0x30 -> g_IV384
//   0x20 -> g_IV256
//   any other value -> g_IV224
// NOTE(review): the selector looks like the digest size in bytes; confirm
// against the Go-side definition of lsh512ContextAsmData.
//
// The 64-bit field at offset 8 of ctx is loaded on entry and written back
// unchanged on exit.
//
// Clobbers: AX, CX, DX, Y0, flags. The upper halves of the YMM registers are
// cleared with VZEROUPPER before returning.
TEXT ·lsh512InitAVX2(SB), NOSPLIT, $0-8
	MOVQ ctx+0(FP), AX
	MOVL (AX), CX
	MOVQ 8(AX), DX

	// lsh512_avx2_init
	CMPL CX, $0x00000040
	JNE  lsh512_avx2_init_if0_end

	// init512
	// load_blk_mem2mem
	// MemcpyStatic
	// VMOVDQA with a memory source requires a 32-byte aligned address.
	// NOTE(review): this relies on how the g_IV* symbols are placed; confirm.
	VMOVDQA g_IV512<>+0(SB), Y0
	VMOVDQU Y0, 16(AX)
	VMOVDQA g_IV512<>+32(SB), Y0
	VMOVDQU Y0, 48(AX)

	// load_blk_mem2mem
	// MemcpyStatic
	VMOVDQA g_IV512<>+64(SB), Y0
	VMOVDQU Y0, 80(AX)
	VMOVDQA g_IV512<>+96(SB), Y0
	VMOVDQU Y0, 112(AX)
	JMP     lsh512_avx2_init_ret

lsh512_avx2_init_if0_end:
	CMPL CX, $0x00000030
	JNE  lsh512_avx2_init_if1_end

	// init384
	// load_blk_mem2mem
	// MemcpyStatic
	VMOVDQA g_IV384<>+0(SB), Y0
	VMOVDQU Y0, 16(AX)
	VMOVDQA g_IV384<>+32(SB), Y0
	VMOVDQU Y0, 48(AX)

	// load_blk_mem2mem
	// MemcpyStatic
	VMOVDQA g_IV384<>+64(SB), Y0
	VMOVDQU Y0, 80(AX)
	VMOVDQA g_IV384<>+96(SB), Y0
	VMOVDQU Y0, 112(AX)
	JMP     lsh512_avx2_init_ret

lsh512_avx2_init_if1_end:
	CMPL CX, $0x00000020
	JNE  lsh512_avx2_init_if2_end

	// init256
	// load_blk_mem2mem
	// MemcpyStatic
	VMOVDQA g_IV256<>+0(SB), Y0
	VMOVDQU Y0, 16(AX)
	VMOVDQA g_IV256<>+32(SB), Y0
	VMOVDQU Y0, 48(AX)

	// load_blk_mem2mem
	// MemcpyStatic
	VMOVDQA g_IV256<>+64(SB), Y0
	VMOVDQU Y0, 80(AX)
	VMOVDQA g_IV256<>+96(SB), Y0
	VMOVDQU Y0, 112(AX)
	JMP     lsh512_avx2_init_ret

lsh512_avx2_init_if2_end:
	// init224 (fall-through for every selector value not matched above)
	// load_blk_mem2mem
	// MemcpyStatic
	VMOVDQA g_IV224<>+0(SB), Y0
	VMOVDQU Y0, 16(AX)
	VMOVDQA g_IV224<>+32(SB), Y0
	VMOVDQU Y0, 48(AX)

	// load_blk_mem2mem
	// MemcpyStatic
	VMOVDQA g_IV224<>+64(SB), Y0
	VMOVDQU Y0, 80(AX)
	VMOVDQA g_IV224<>+96(SB), Y0
	VMOVDQU Y0, 112(AX)

lsh512_avx2_init_ret:
	// Write back the offset-8 field (value unchanged since it was loaded).
	MOVQ ctx+0(FP), AX
	MOVQ DX, 8(AX)

	// Y0 was used with 256-bit VEX moves above. Clear the upper YMM state so
	// legacy-SSE code that runs after the return does not pay an AVX-SSE
	// transition penalty.
	VZEROUPPER
	RET

// func lsh512UpdateAVX2(ctx *lsh512ContextAsmData, data []byte)
// Requires: AVX, AVX2, SSE2
TEXT ·lsh512UpdateAVX2(SB), NOSPLIT, $0-32
	MOVQ ctx+0(FP), AX
	MOVL (AX), CX
	MOVQ 8(AX), CX
	MOVQ data_base+8(FP), DX
	MOVQ data_len+16(FP), BX

	// lsh512_avx2_update
	MOVQ CX, SI
	MOVQ BX, DI
	ADDQ SI, DI
	CMPQ DI, $0x00000100
	JGE  lsh512_avx2_update_if0_end

	// Memcpy
	LEAQ 144(AX)(SI*1), AX
	LEAQ (DX), DI
	MOVQ BX, R8

memcpy_7_sz16_start:
	CMPQ  R8, $0x00000010
	JL    memcpy_7_sz16_end
	MOVOU (DI), X0
	MOVOU X0, (AX)
	ADDQ  $0x00000010, DI
	ADDQ  $0x00000010, AX
	SUBQ  $0x00000010, R8
	JMP   memcpy_7_sz16_start

memcpy_7_sz16_end:
memcpy_7_sz8_start:
	CMPQ R8, $0x00000008
	JL   memcpy_7_sz8_end
	MOVQ (DI), DX
	MOVQ DX, (AX)
	ADDQ $0x00000008, DI
	ADDQ $0x00000008, AX
	SUBQ $0x00000008, R8
	JMP  memcpy_7_sz8_start

memcpy_7_sz8_end:
memcpy_7_sz4_start:
	CMPQ R8, $0x00000004
	JL   memcpy_7_sz4_end
	MOVL (DI), DX
	MOVL DX, (AX)
	ADDQ $0x00000004, DI
	ADDQ $0x00000004, AX
	SUBQ $0x00000004, R8
	JMP  memcpy_7_sz4_start

memcpy_7_sz4_end:
memcpy_7_sz2_start:
	CMPQ R8, $0x00000002
	JL   memcpy_7_sz2_end
	MOVW (DI), DX
	MOVW DX, (AX)
	ADDQ $0x00000002, DI
	ADDQ $0x00000002, AX
	SUBQ $0x00000002, R8
	JMP  memcpy_7_sz2_start

memcpy_7_sz2_end:
memcpy_7_sz1_start:
	CMPQ R8, $0x00000001
	JL   memcpy_7_sz1_end
	MOVB (DI), DL
	MOVB DL, (AX)
	ADDQ $0x00000001, DI
	ADDQ $0x00000001, AX
	SUBQ $0x00000001, R8
	JMP  memcpy_7_sz1_start

memcpy_7_sz1_end:
	ADDQ BX, CX
	ADDQ BX, SI
	JMP  lsh512_avx2_update_ret

lsh512_avx2_update_if0_end:
	// load_blk_mem2vec
	VMOVDQU 16(AX), Y0
	VMOVDQU 48(AX), Y1

	// load_blk_mem2vec
	VMOVDQU 80(AX), Y2
	VMOVDQU 112(AX), Y3
	CMPQ    SI, $0x00000000
	JE      lsh512_avx2_update_if1_end
	MOVQ    $0x00000100, CX
	SUBQ    SI, CX

	// Memcpy
	LEAQ 144(AX)(SI*1), DI
	LEAQ (DX), R8
	MOVQ CX, R9

memcpy_8_sz16_start:
	CMPQ  R9, $0x00000010
	JL    memcpy_8_sz16_end
	MOVOU (R8), X4
	MOVOU X4, (DI)
	ADDQ  $0x00000010, R8
	ADDQ  $0x00000010, DI
	SUBQ  $0x00000010, R9
	JMP   memcpy_8_sz16_start

memcpy_8_sz16_end:
memcpy_8_sz8_start:
	CMPQ R9, $0x00000008
	JL   memcpy_8_sz8_end
	MOVQ (R8), SI
	MOVQ SI, (DI)
	ADDQ $0x00000008, R8
	ADDQ $0x00000008, DI
	SUBQ $0x00000008, R9
	JMP  memcpy_8_sz8_start

memcpy_8_sz8_end:
memcpy_8_sz4_start:
	CMPQ R9, $0x00000004
	JL   memcpy_8_sz4_end
	MOVL (R8), SI
	MOVL SI, (DI)
	ADDQ $0x00000004, R8
	ADDQ $0x00000004, DI
	SUBQ $0x00000004, R9
	JMP  memcpy_8_sz4_start

memcpy_8_sz4_end:
memcpy_8_sz2_start:
	CMPQ R9, $0x00000002
	JL   memcpy_8_sz2_end
	MOVW (R8), SI
	MOVW SI, (DI)
	ADDQ $0x00000002, R8
	ADDQ $0x00000002, DI
	SUBQ $0x00000002, R9
	JMP  memcpy_8_sz2_start

memcpy_8_sz2_end:
memcpy_8_sz1_start:
	CMPQ R9, $0x00000001
	JL   memcpy_8_sz1_end
	MOVB (R8), SI
	MOVB SI, (DI)
	ADDQ $0x00000001, R8
	ADDQ $0x00000001, DI
	SUBQ $0x00000001, R9
	JMP  memcpy_8_sz1_start

memcpy_8_sz1_end:
	// compress
	// load_blk_mem2vec
	VMOVDQU 144(AX), Y6
	VMOVDQU 176(AX), Y7

	// load_blk_mem2vec
	VMOVDQU 208(AX), Y8
	VMOVDQU 240(AX), Y9

	// load_blk_mem2vec
	VMOVDQU 272(AX), Y10
	VMOVDQU 304(AX), Y11

	// load_blk_mem2vec
	VMOVDQU 336(AX), Y12
	VMOVDQU 368(AX), Y13

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+0(SB), Y4
	VMOVDQA g_StepConstants<>+32(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+64(SB), Y4
	VMOVDQA g_StepConstants<>+96(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+128(SB), Y4
	VMOVDQA g_StepConstants<>+160(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+192(SB), Y4
	VMOVDQA g_StepConstants<>+224(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+256(SB), Y4
	VMOVDQA g_StepConstants<>+288(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+320(SB), Y4
	VMOVDQA g_StepConstants<>+352(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+384(SB), Y4
	VMOVDQA g_StepConstants<>+416(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+448(SB), Y4
	VMOVDQA g_StepConstants<>+480(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+512(SB), Y4
	VMOVDQA g_StepConstants<>+544(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+576(SB), Y4
	VMOVDQA g_StepConstants<>+608(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+640(SB), Y4
	VMOVDQA g_StepConstants<>+672(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+704(SB), Y4
	VMOVDQA g_StepConstants<>+736(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+768(SB), Y4
	VMOVDQA g_StepConstants<>+800(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+832(SB), Y4
	VMOVDQA g_StepConstants<>+864(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+896(SB), Y4
	VMOVDQA g_StepConstants<>+928(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+960(SB), Y4
	VMOVDQA g_StepConstants<>+992(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1024(SB), Y4
	VMOVDQA g_StepConstants<>+1056(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1088(SB), Y4
	VMOVDQA g_StepConstants<>+1120(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1152(SB), Y4
	VMOVDQA g_StepConstants<>+1184(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1216(SB), Y4
	VMOVDQA g_StepConstants<>+1248(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1280(SB), Y4
	VMOVDQA g_StepConstants<>+1312(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1344(SB), Y4
	VMOVDQA g_StepConstants<>+1376(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1408(SB), Y4
	VMOVDQA g_StepConstants<>+1440(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1472(SB), Y4
	VMOVDQA g_StepConstants<>+1504(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1536(SB), Y4
	VMOVDQA g_StepConstants<>+1568(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1600(SB), Y4
	VMOVDQA g_StepConstants<>+1632(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1664(SB), Y4
	VMOVDQA g_StepConstants<>+1696(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1728(SB), Y4
	VMOVDQA g_StepConstants<>+1760(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3
	ADDQ  CX, DX
	SUBQ  CX, BX
	MOVQ  $0x00000000, SI
	MOVQ  $0x00000000, CX

lsh512_avx2_update_if1_end:
lsh512_avx2_update_while_start:
	CMPQ BX, $0x00000100
	JL   lsh512_avx2_update_while_end

	// compress
	// load_blk_mem2vec
	VMOVDQU (DX), Y6
	VMOVDQU 32(DX), Y7

	// load_blk_mem2vec
	VMOVDQU 64(DX), Y8
	VMOVDQU 96(DX), Y9

	// load_blk_mem2vec
	VMOVDQU 128(DX), Y10
	VMOVDQU 160(DX), Y11

	// load_blk_mem2vec
	VMOVDQU 192(DX), Y12
	VMOVDQU 224(DX), Y13

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+0(SB), Y4
	VMOVDQA g_StepConstants<>+32(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+64(SB), Y4
	VMOVDQA g_StepConstants<>+96(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+128(SB), Y4
	VMOVDQA g_StepConstants<>+160(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+192(SB), Y4
	VMOVDQA g_StepConstants<>+224(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+256(SB), Y4
	VMOVDQA g_StepConstants<>+288(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+320(SB), Y4
	VMOVDQA g_StepConstants<>+352(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+384(SB), Y4
	VMOVDQA g_StepConstants<>+416(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+448(SB), Y4
	VMOVDQA g_StepConstants<>+480(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+512(SB), Y4
	VMOVDQA g_StepConstants<>+544(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+576(SB), Y4
	VMOVDQA g_StepConstants<>+608(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+640(SB), Y4
	VMOVDQA g_StepConstants<>+672(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+704(SB), Y4
	VMOVDQA g_StepConstants<>+736(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+768(SB), Y4
	VMOVDQA g_StepConstants<>+800(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+832(SB), Y4
	VMOVDQA g_StepConstants<>+864(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+896(SB), Y4
	VMOVDQA g_StepConstants<>+928(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+960(SB), Y4
	VMOVDQA g_StepConstants<>+992(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1024(SB), Y4
	VMOVDQA g_StepConstants<>+1056(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1088(SB), Y4
	VMOVDQA g_StepConstants<>+1120(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1152(SB), Y4
	VMOVDQA g_StepConstants<>+1184(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1216(SB), Y4
	VMOVDQA g_StepConstants<>+1248(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1280(SB), Y4
	VMOVDQA g_StepConstants<>+1312(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1344(SB), Y4
	VMOVDQA g_StepConstants<>+1376(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1408(SB), Y4
	VMOVDQA g_StepConstants<>+1440(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1472(SB), Y4
	VMOVDQA g_StepConstants<>+1504(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1536(SB), Y4
	VMOVDQA g_StepConstants<>+1568(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1600(SB), Y4
	VMOVDQA g_StepConstants<>+1632(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1664(SB), Y4
	VMOVDQA g_StepConstants<>+1696(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1728(SB), Y4
	VMOVDQA g_StepConstants<>+1760(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3
	ADDQ  $0x00000100, DX
	SUBQ  $0x00000100, BX
	JMP   lsh512_avx2_update_while_start

lsh512_avx2_update_while_end:
	// The block loop has exited. Write the four state vectors Y0..Y3 back to
	// the 128 bytes at AX+16 .. AX+143 (unaligned stores, so the destination
	// needs no particular alignment).
	// NOTE(review): AX is presumably still the ctx pointer set up before this
	// section (it is reloaded from ctx+0(FP) at the return label) - confirm
	// against the function prologue.
	// store_blk
	VMOVDQU Y0, 16(AX)
	VMOVDQU Y1, 48(AX)

	// store_blk
	VMOVDQU Y2, 80(AX)
	VMOVDQU Y3, 112(AX)

	// No YMM register is read again on this path. Clear the upper halves now
	// so the legacy-SSE MOVOU copies below do not run with dirty upper YMM
	// state (AVX/SSE transition penalty or false dependencies, depending on
	// the microarchitecture). VZEROUPPER leaves flags and the low 128 bits of
	// every vector register untouched.
	VZEROUPPER

	// BX is the byte counter the block loop decrements by 0x100 per
	// iteration. If nothing is left over, skip the tail copy.
	CMPQ BX, $0x00000000
	JE   lsh512_avx2_update_if3_end

	// Memcpy
	// Copy the BX leftover input bytes from (DX) to the area starting at
	// AX+144. AX and DX are advanced as cursors and SI counts the bytes still
	// to copy, so BX keeps the original length for the final MOVQ BX, CX.
	// LEAQ (DX), DX is a no-op emitted by the generator.
	LEAQ 144(AX), AX
	LEAQ (DX), DX
	MOVQ BX, SI

	// 16-byte chunks through X0. The size checks use JL (signed compare); SI
	// is only reduced after passing a check, so it never drops below zero if
	// it started non-negative.
memcpy_9_sz16_start:
	CMPQ  SI, $0x00000010
	JL    memcpy_9_sz16_end
	MOVOU (DX), X0
	MOVOU X0, (AX)
	ADDQ  $0x00000010, DX
	ADDQ  $0x00000010, AX
	SUBQ  $0x00000010, SI
	JMP   memcpy_9_sz16_start

	// From here on SI < 16, so each of the 8/4/2/1-byte steps below copies at
	// most once. CX (CL for the byte step) is the scratch register; it is
	// overwritten with BX once the copy is done.
memcpy_9_sz16_end:
memcpy_9_sz8_start:
	CMPQ SI, $0x00000008
	JL   memcpy_9_sz8_end
	MOVQ (DX), CX
	MOVQ CX, (AX)
	ADDQ $0x00000008, DX
	ADDQ $0x00000008, AX
	SUBQ $0x00000008, SI
	JMP  memcpy_9_sz8_start

memcpy_9_sz8_end:
memcpy_9_sz4_start:
	CMPQ SI, $0x00000004
	JL   memcpy_9_sz4_end
	MOVL (DX), CX
	MOVL CX, (AX)
	ADDQ $0x00000004, DX
	ADDQ $0x00000004, AX
	SUBQ $0x00000004, SI
	JMP  memcpy_9_sz4_start

memcpy_9_sz4_end:
memcpy_9_sz2_start:
	CMPQ SI, $0x00000002
	JL   memcpy_9_sz2_end
	MOVW (DX), CX
	MOVW CX, (AX)
	ADDQ $0x00000002, DX
	ADDQ $0x00000002, AX
	SUBQ $0x00000002, SI
	JMP  memcpy_9_sz2_start

memcpy_9_sz2_end:
memcpy_9_sz1_start:
	CMPQ SI, $0x00000001
	JL   memcpy_9_sz1_end
	MOVB (DX), CL
	MOVB CL, (AX)
	ADDQ $0x00000001, DX
	ADDQ $0x00000001, AX
	SUBQ $0x00000001, SI
	JMP  memcpy_9_sz1_start

memcpy_9_sz1_end:
	// CX = number of bytes just copied to ctx+144.
	MOVQ BX, CX

lsh512_avx2_update_if3_end:
lsh512_avx2_update_ret:
	// Shared exit: store CX into the 64-bit field at ctx+8. After the copy
	// above CX is the leftover byte count; on the BX == 0 path and on any
	// jump into these labels from code before this section, CX carries
	// whatever that earlier code left in it.
	// The second VZEROUPPER covers paths that reach this label without going
	// through the one above, so the caller never resumes with dirty upper
	// YMM halves. It is cheap when the upper halves are already clean.
	VZEROUPPER
	MOVQ ctx+0(FP), AX
	MOVQ CX, 8(AX)
	RET

// func lsh512FinalAVX2(ctx *lsh512ContextAsmData, hashval []byte)
// Requires: AVX, AVX2, SSE2
TEXT ·lsh512FinalAVX2(SB), NOSPLIT, $0-32
	// Frame: no locals ($0); 32 bytes of arguments (ctx pointer + hashval
	// slice header).
	// AX = ctx. The 32-bit load from ctx+0 into CX is overwritten by the very
	// next instruction before anything reads it, so it is dead in the code
	// visible here. CX then holds the 64-bit field at ctx+8, which is used
	// below as a byte index into the buffer at ctx+144.
	// DX = base pointer of hashval; it is not used in this section.
	MOVQ ctx+0(FP), AX
	MOVL (AX), CX
	MOVQ 8(AX), CX
	MOVQ hashval_base+8(FP), DX

	// lsh512_avx2_final
	// Padding: write the 0x80 marker byte at buffer[BX] (buffer = ctx+144),
	// then SI = 0xff - BX is the number of bytes after the marker up to the
	// end of the 256-byte buffer (ctx+144 .. ctx+399, the range loaded as the
	// message block after this section).
	// NOTE(review): this relies on the value at ctx+8 being <= 255. With a
	// larger value the marker lands outside the buffer and SI becomes
	// negative; the signed JL checks then skip every wide fill and the final
	// byte loop, which tests only SI == 0, keeps writing past the buffer.
	MOVQ CX, BX
	MOVB $0x80, 144(AX)(BX*1)
	MOVQ $0x000000ff, SI
	SUBQ BX, SI

	// memset
	// Fill the SI bytes starting at BX (the byte right after the marker).
	// Each width is checked once before its fill value is loaded, then the
	// loop stores, advances BX, reduces SI and re-checks (JL is a signed
	// compare).
	// NOTE(review): the 16/8/4/2-byte fills take their value from
	// memset_value_0<>, defined elsewhere in the file; presumably all-zero,
	// matching the literal $0x00 used by the byte loop - confirm.
	// MOVO is the aligned 128-bit move, so memset_value_0<> must be 16-byte
	// aligned.
	LEAQ 145(AX)(BX*1), BX
	CMPQ SI, $0x00000010
	JL   memset_3_sz16_end
	MOVO memset_value_0<>+0(SB), X0

	// 16 bytes per iteration, unaligned store from X0.
memset_3_sz16_start:
	MOVOU X0, (BX)
	SUBQ  $0x00000010, SI
	ADDQ  $0x00000010, BX
	CMPQ  SI, $0x00000010
	JL    memset_3_sz16_end
	JMP   memset_3_sz16_start

	// SI < 16 from here on; DI carries the fill value for the 8/4/2-byte steps.
memset_3_sz16_end:
	CMPQ SI, $0x00000008
	JL   memset_3_sz8_end
	MOVQ memset_value_0<>+0(SB), DI

memset_3_sz8_start:
	MOVQ DI, (BX)
	SUBQ $0x00000008, SI
	ADDQ $0x00000008, BX
	CMPQ SI, $0x00000008
	JL   memset_3_sz8_end
	JMP  memset_3_sz8_start

memset_3_sz8_end:
	CMPQ SI, $0x00000004
	JL   memset_3_sz4_end
	MOVL memset_value_0<>+0(SB), DI

memset_3_sz4_start:
	MOVL DI, (BX)
	SUBQ $0x00000004, SI
	ADDQ $0x00000004, BX
	CMPQ SI, $0x00000004
	JL   memset_3_sz4_end
	JMP  memset_3_sz4_start

memset_3_sz4_end:
	CMPQ SI, $0x00000002
	JL   memset_3_sz2_end
	MOVW memset_value_0<>+0(SB), DI

memset_3_sz2_start:
	MOVW DI, (BX)
	SUBQ $0x00000002, SI
	ADDQ $0x00000002, BX
	CMPQ SI, $0x00000002
	JL   memset_3_sz2_end
	JMP  memset_3_sz2_start

	// Remaining bytes one at a time with an immediate zero. The exit test is
	// equality with zero (JE), not a signed less-than like the steps above.
memset_3_sz2_end:
memset_3_1_start:
	CMPQ SI, $0x00000000
	JE   memset_3_1_end
	MOVB $0x00, (BX)
	SUBQ $0x00000001, SI
	ADDQ $0x00000001, BX
	JMP  memset_3_1_start

memset_3_1_end:
	// load_blk_mem2vec
	VMOVDQU 16(AX), Y0
	VMOVDQU 48(AX), Y1

	// load_blk_mem2vec
	VMOVDQU 80(AX), Y2
	VMOVDQU 112(AX), Y3

	// compress
	// load_blk_mem2vec
	VMOVDQU 144(AX), Y6
	VMOVDQU 176(AX), Y7

	// load_blk_mem2vec
	VMOVDQU 208(AX), Y8
	VMOVDQU 240(AX), Y9

	// load_blk_mem2vec
	VMOVDQU 272(AX), Y10
	VMOVDQU 304(AX), Y11

	// load_blk_mem2vec
	VMOVDQU 336(AX), Y12
	VMOVDQU 368(AX), Y13

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+0(SB), Y4
	VMOVDQA g_StepConstants<>+32(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+64(SB), Y4
	VMOVDQA g_StepConstants<>+96(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+128(SB), Y4
	VMOVDQA g_StepConstants<>+160(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+192(SB), Y4
	VMOVDQA g_StepConstants<>+224(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+256(SB), Y4
	VMOVDQA g_StepConstants<>+288(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+320(SB), Y4
	VMOVDQA g_StepConstants<>+352(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+384(SB), Y4
	VMOVDQA g_StepConstants<>+416(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+448(SB), Y4
	VMOVDQA g_StepConstants<>+480(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+512(SB), Y4
	VMOVDQA g_StepConstants<>+544(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+576(SB), Y4
	VMOVDQA g_StepConstants<>+608(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+640(SB), Y4
	VMOVDQA g_StepConstants<>+672(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+704(SB), Y4
	VMOVDQA g_StepConstants<>+736(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+768(SB), Y4
	VMOVDQA g_StepConstants<>+800(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+832(SB), Y4
	VMOVDQA g_StepConstants<>+864(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+896(SB), Y4
	VMOVDQA g_StepConstants<>+928(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+960(SB), Y4
	VMOVDQA g_StepConstants<>+992(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1024(SB), Y4
	VMOVDQA g_StepConstants<>+1056(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1088(SB), Y4
	VMOVDQA g_StepConstants<>+1120(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1152(SB), Y4
	VMOVDQA g_StepConstants<>+1184(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1216(SB), Y4
	VMOVDQA g_StepConstants<>+1248(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1280(SB), Y4
	VMOVDQA g_StepConstants<>+1312(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1344(SB), Y4
	VMOVDQA g_StepConstants<>+1376(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1408(SB), Y4
	VMOVDQA g_StepConstants<>+1440(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1472(SB), Y4
	VMOVDQA g_StepConstants<>+1504(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1536(SB), Y4
	VMOVDQA g_StepConstants<>+1568(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1600(SB), Y4
	VMOVDQA g_StepConstants<>+1632(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1664(SB), Y4
	VMOVDQA g_StepConstants<>+1696(SB), Y5

	// mix_even
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_even_alpha
	VPSLLQ $0x17, Y0, Y14
	VPSRLQ $0x29, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x17, Y1, Y14
	VPSRLQ $0x29, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_even_beta
	VPSLLQ $0x3b, Y2, Y4
	VPSRLQ $0x05, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x3b, Y3, Y4
	VPSRLQ $0x05, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y10, Y10
	VPADDQ  Y10, Y6, Y10
	VPERMQ  $0x93, Y11, Y11
	VPADDQ  Y11, Y7, Y11
	VPERMQ  $0x4b, Y12, Y12
	VPADDQ  Y12, Y8, Y12
	VPERMQ  $0x93, Y13, Y13
	VPADDQ  Y13, Y9, Y13

	// msg_add_odd
	VPXOR Y10, Y0, Y0
	VPXOR Y11, Y1, Y1
	VPXOR Y12, Y2, Y2
	VPXOR Y13, Y3, Y3

	// load_sc
	// load_blk_mem2vec
	VMOVDQA g_StepConstants<>+1728(SB), Y4
	VMOVDQA g_StepConstants<>+1760(SB), Y5

	// mix_odd
	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_blk_odd_alpha
	VPSLLQ $0x07, Y0, Y14
	VPSRLQ $0x39, Y0, Y0
	VPOR   Y0, Y14, Y0
	VPSLLQ $0x07, Y1, Y14
	VPSRLQ $0x39, Y1, Y1
	VPOR   Y1, Y14, Y1
	VPXOR  Y4, Y0, Y0
	VPXOR  Y5, Y1, Y1

	// add_blk
	VPADDQ Y0, Y2, Y2
	VPADDQ Y1, Y3, Y3

	// rotate_blk_odd_beta
	VPSLLQ $0x03, Y2, Y4
	VPSRLQ $0x3d, Y2, Y2
	VPOR   Y2, Y4, Y2
	VPSLLQ $0x03, Y3, Y4
	VPSRLQ $0x3d, Y3, Y3
	VPOR   Y3, Y4, Y3

	// add_blk
	VPADDQ Y2, Y0, Y0
	VPADDQ Y3, Y1, Y1

	// rotate_msg_gamma
	VPSHUFB g_BytePermInfo_avx2<>+0(SB), Y2, Y2
	VPSHUFB g_BytePermInfo_avx2<>+32(SB), Y3, Y3

	// word_perm
	VPERMQ  $0xd2, Y0, Y0
	VPERMQ  $0xd2, Y1, Y1
	VPERMQ  $0x6c, Y2, Y2
	VPERMQ  $0x6c, Y3, Y3
	VMOVDQA Y0, Y4
	VMOVDQA Y2, Y5
	VMOVDQA Y1, Y0
	VMOVDQA Y3, Y1
	VMOVDQA Y4, Y2
	VMOVDQA Y5, Y3
	VPERMQ  $0x4b, Y6, Y6
	VPADDQ  Y6, Y10, Y6
	VPERMQ  $0x93, Y7, Y7
	VPADDQ  Y7, Y11, Y7
	VPERMQ  $0x4b, Y8, Y8
	VPADDQ  Y8, Y12, Y8
	VPERMQ  $0x93, Y9, Y9
	VPADDQ  Y9, Y13, Y9

	// msg_add_even
	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	// fin
	VPXOR Y2, Y0, Y0
	VPXOR Y3, Y1, Y1

	// get_hash
	VMOVDQU Y0, (DX)
	VMOVDQU Y1, 32(DX)
	MOVQ    ctx+0(FP), AX
	MOVQ    CX, 8(AX)
	RET
