chacha_s390x.s 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build !gccgo,!appengine
  5. #include "go_asm.h"
  6. #include "textflag.h"
  7. // This is an implementation of the ChaCha20 encryption algorithm as
  8. // specified in RFC 7539. It uses vector instructions to compute
  9. // 4 keystream blocks in parallel (256 bytes) which are then XORed
  10. // with the bytes in the input slice.
  11. GLOBL ·constants<>(SB), RODATA|NOPTR, $32
  12. // BSWAP: swap bytes in each 4-byte element
  13. DATA ·constants<>+0x00(SB)/4, $0x03020100
  14. DATA ·constants<>+0x04(SB)/4, $0x07060504
  15. DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
  16. DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
  17. // J0: [j0, j1, j2, j3]
  18. DATA ·constants<>+0x10(SB)/4, $0x61707865
  19. DATA ·constants<>+0x14(SB)/4, $0x3320646e
  20. DATA ·constants<>+0x18(SB)/4, $0x79622d32
  21. DATA ·constants<>+0x1c(SB)/4, $0x6b206574
  22. #define BSWAP V5
  23. #define J0 V6
  24. #define KEY0 V7
  25. #define KEY1 V8
  26. #define NONCE V9
  27. #define CTR V10
  28. #define M0 V11
  29. #define M1 V12
  30. #define M2 V13
  31. #define M3 V14
  32. #define INC V15
  33. #define X0 V16
  34. #define X1 V17
  35. #define X2 V18
  36. #define X3 V19
  37. #define X4 V20
  38. #define X5 V21
  39. #define X6 V22
  40. #define X7 V23
  41. #define X8 V24
  42. #define X9 V25
  43. #define X10 V26
  44. #define X11 V27
  45. #define X12 V28
  46. #define X13 V29
  47. #define X14 V30
  48. #define X15 V31
  49. #define NUM_ROUNDS 20
  50. #define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
  51. VAF a1, a0, a0 \
  52. VAF b1, b0, b0 \
  53. VAF c1, c0, c0 \
  54. VAF d1, d0, d0 \
  55. VX a0, a2, a2 \
  56. VX b0, b2, b2 \
  57. VX c0, c2, c2 \
  58. VX d0, d2, d2 \
  59. VERLLF $16, a2, a2 \
  60. VERLLF $16, b2, b2 \
  61. VERLLF $16, c2, c2 \
  62. VERLLF $16, d2, d2 \
  63. VAF a2, a3, a3 \
  64. VAF b2, b3, b3 \
  65. VAF c2, c3, c3 \
  66. VAF d2, d3, d3 \
  67. VX a3, a1, a1 \
  68. VX b3, b1, b1 \
  69. VX c3, c1, c1 \
  70. VX d3, d1, d1 \
  71. VERLLF $12, a1, a1 \
  72. VERLLF $12, b1, b1 \
  73. VERLLF $12, c1, c1 \
  74. VERLLF $12, d1, d1 \
  75. VAF a1, a0, a0 \
  76. VAF b1, b0, b0 \
  77. VAF c1, c0, c0 \
  78. VAF d1, d0, d0 \
  79. VX a0, a2, a2 \
  80. VX b0, b2, b2 \
  81. VX c0, c2, c2 \
  82. VX d0, d2, d2 \
  83. VERLLF $8, a2, a2 \
  84. VERLLF $8, b2, b2 \
  85. VERLLF $8, c2, c2 \
  86. VERLLF $8, d2, d2 \
  87. VAF a2, a3, a3 \
  88. VAF b2, b3, b3 \
  89. VAF c2, c3, c3 \
  90. VAF d2, d3, d3 \
  91. VX a3, a1, a1 \
  92. VX b3, b1, b1 \
  93. VX c3, c1, c1 \
  94. VX d3, d1, d1 \
  95. VERLLF $7, a1, a1 \
  96. VERLLF $7, b1, b1 \
  97. VERLLF $7, c1, c1 \
  98. VERLLF $7, d1, d1
  99. #define PERMUTE(mask, v0, v1, v2, v3) \
  100. VPERM v0, v0, mask, v0 \
  101. VPERM v1, v1, mask, v1 \
  102. VPERM v2, v2, mask, v2 \
  103. VPERM v3, v3, mask, v3
  104. #define ADDV(x, v0, v1, v2, v3) \
  105. VAF x, v0, v0 \
  106. VAF x, v1, v1 \
  107. VAF x, v2, v2 \
  108. VAF x, v3, v3
  109. #define XORV(off, dst, src, v0, v1, v2, v3) \
  110. VLM off(src), M0, M3 \
  111. PERMUTE(BSWAP, v0, v1, v2, v3) \
  112. VX v0, M0, M0 \
  113. VX v1, M1, M1 \
  114. VX v2, M2, M2 \
  115. VX v3, M3, M3 \
  116. VSTM M0, M3, off(dst)
  117. #define SHUFFLE(a, b, c, d, t, u, v, w) \
  118. VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
  119. VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
  120. VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
  121. VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
  122. VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
  123. VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
  124. VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
  125. VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
  126. // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
  127. TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
  128. MOVD $·constants<>(SB), R1
  129. MOVD dst+0(FP), R2 // R2=&dst[0]
  130. LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src)
  131. MOVD key+48(FP), R5 // R5=key
  132. MOVD nonce+56(FP), R6 // R6=nonce
  133. MOVD counter+64(FP), R7 // R7=counter
  134. // load BSWAP and J0
  135. VLM (R1), BSWAP, J0
  136. // setup
  137. MOVD $95, R0
  138. VLM (R5), KEY0, KEY1
  139. VLL R0, (R6), NONCE
  140. VZERO M0
  141. VLEIB $7, $32, M0
  142. VSRLB M0, NONCE, NONCE
  143. // initialize counter values
  144. VLREPF (R7), CTR
  145. VZERO INC
  146. VLEIF $1, $1, INC
  147. VLEIF $2, $2, INC
  148. VLEIF $3, $3, INC
  149. VAF INC, CTR, CTR
  150. VREPIF $4, INC
  151. chacha:
  152. VREPF $0, J0, X0
  153. VREPF $1, J0, X1
  154. VREPF $2, J0, X2
  155. VREPF $3, J0, X3
  156. VREPF $0, KEY0, X4
  157. VREPF $1, KEY0, X5
  158. VREPF $2, KEY0, X6
  159. VREPF $3, KEY0, X7
  160. VREPF $0, KEY1, X8
  161. VREPF $1, KEY1, X9
  162. VREPF $2, KEY1, X10
  163. VREPF $3, KEY1, X11
  164. VLR CTR, X12
  165. VREPF $1, NONCE, X13
  166. VREPF $2, NONCE, X14
  167. VREPF $3, NONCE, X15
  168. MOVD $(NUM_ROUNDS/2), R1
  169. loop:
  170. ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
  171. ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)
  172. ADD $-1, R1
  173. BNE loop
  174. // decrement length
  175. ADD $-256, R4
  176. // rearrange vectors
  177. SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
  178. ADDV(J0, X0, X1, X2, X3)
  179. SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
  180. ADDV(KEY0, X4, X5, X6, X7)
  181. SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
  182. ADDV(KEY1, X8, X9, X10, X11)
  183. VAF CTR, X12, X12
  184. SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
  185. ADDV(NONCE, X12, X13, X14, X15)
  186. // increment counters
  187. VAF INC, CTR, CTR
  188. // xor keystream with plaintext
  189. XORV(0*64, R2, R3, X0, X4, X8, X12)
  190. XORV(1*64, R2, R3, X1, X5, X9, X13)
  191. XORV(2*64, R2, R3, X2, X6, X10, X14)
  192. XORV(3*64, R2, R3, X3, X7, X11, X15)
  193. // increment pointers
  194. MOVD $256(R2), R2
  195. MOVD $256(R3), R3
  196. CMPBNE R4, $0, chacha
  197. VSTEF $0, CTR, (R7)
  198. RET