// sum_amd64.s

  // Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build gc && !purego
// +build gc,!purego

#include "textflag.h"

  // POLY1305_ADD(msg, h0, h1, h2) adds one 16-byte block to the accumulator
// held in h2:h1:h0 (h0 is the least significant 64-bit limb) and advances
// msg past the block:
//
//	h0 += 64-bit word at 0(msg)
//	h1 += 64-bit word at 8(msg) + carry
//	h2 += 1 + carry        // the 1 has weight 2^128, just above the block
//	msg += 16
//
// msg must be a register holding the block address; h0, h1 and h2 must be
// registers. The macro modifies the flags and the four registers passed in,
// nothing else.
#define POLY1305_ADD(msg, h0, h1, h2) \
	ADDQ 0(msg), h0;  \
	ADCQ 8(msg), h1;  \
	ADCQ $1, h2;      \
	LEAQ 16(msg), msg

  // POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) multiplies the accumulator
// h2:h1:h0 by the two-limb value r1:r0 and folds the product back into
// h2:h1:h0 (h0 and r0 are the least significant 64-bit limbs).
//
// Part 1 - product in t3:t2:t1:t0 (t0 least significant):
//   - r0*h0, r0*h1, r1*h0 and r1*h1 use MULQ, which leaves the full 128-bit
//     result in DX:AX.
//   - r0*h2 and r1*h2 use IMULQ and keep only the low 64 bits; carries out
//     of the sums into t2 and t3 that involve them are not propagated.
//     NOTE(review): this presumably relies on h2 being only a few bits wide
//     and on r having been clamped by the caller - confirm.
//   - h0 is reused as scratch for the high half of r1*h0 until the product
//     is complete.
//
// Part 2 - reduction. Let c = (t3:t2) >> 2, the part of the product at and
// above bit 130. The macro computes
//
//	h2:h1:h0 = (t2 & 3):t1:t0 + 4*c + c
//
// where 4*c is t3:t2 with the two low bits of t2 cleared, and c comes from
// the double-register shift. Replacing c*2^130 by 5*c keeps the value
// congruent modulo 2^130 - 5. Carries are added into h2, so h2 can exceed 3
// afterwards: the result is congruent but not necessarily fully reduced.
//
// All nine arguments must be distinct registers other than AX and DX.
// Clobbers AX, DX, t0-t3 and the flags; r0 and r1 are only read.
// The instruction order is carry-flag sensitive - do not reorder.
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
	MOVQ  r0, AX;                  \
	MULQ  h0;                      \
	MOVQ  AX, t0;                  \
	MOVQ  DX, t1;                  \
	MOVQ  r0, AX;                  \
	MULQ  h1;                      \
	ADDQ  AX, t1;                  \
	ADCQ  $0, DX;                  \
	MOVQ  r0, t2;                  \
	IMULQ h2, t2;                  \
	ADDQ  DX, t2;                  \
	                               \
	MOVQ  r1, AX;                  \
	MULQ  h0;                      \
	ADDQ  AX, t1;                  \
	ADCQ  $0, DX;                  \
	MOVQ  DX, h0;                  \
	MOVQ  r1, t3;                  \
	IMULQ h2, t3;                  \
	MOVQ  r1, AX;                  \
	MULQ  h1;                      \
	ADDQ  AX, t2;                  \
	ADCQ  DX, t3;                  \
	ADDQ  h0, t2;                  \
	ADCQ  $0, t3;                  \
	                               \
	MOVQ  t0, h0;                  \
	MOVQ  t1, h1;                  \
	MOVQ  t2, h2;                  \
	ANDQ  $3, h2;                  \
	MOVQ  t2, t0;                  \
	ANDQ  $0xFFFFFFFFFFFFFFFC, t0; \
	ADDQ  t0, h0;                  \
	ADCQ  t3, h1;                  \
	ADCQ  $0, h2;                  \
	SHRQ  $2, t3, t2;              \
	SHRQ  $2, t3;                  \
	ADDQ  t2, h0;                  \
	ADCQ  t3, h1;                  \
	ADCQ  $0, h2

  // update folds msg into the accumulator stored in state.
//
// State layout as used here (64-bit words):
//	state[0..2]  h0, h1, h2 - accumulator, loaded on entry and stored on exit
//	state[3..4]  r0, r1     - multiplier, read only
// state[5] and state[6] are not accessed by this routine.
//
// Each complete 16-byte block is processed as h = (h + block + 2^128) * r.
// A trailing partial block of 1..15 bytes is read as a little-endian number
// with a single 1 byte placed directly above its last byte, added to h
// (without the 2^128 term), and multiplied by r in the same way. An empty
// msg leaves the accumulator unchanged.
//
// Frame $0-32: no local stack, 32 bytes of arguments (pointer + slice header).
//
// Register roles:
//	DI           state pointer
//	SI           current position in msg
//	R15          bytes of msg still to process
//	R8, R9, R10  h0, h1, h2
//	R11, R12     r0, r1
//	BX, CX, R13, R14, AX, DX  scratch (macro temporaries / tail assembly)
//
// func update(state *[7]uint64, msg []byte)
TEXT ·update(SB), $0-32
	MOVQ state+0(FP), DI
	MOVQ msg_base+8(FP), SI
	MOVQ msg_len+16(FP), R15

	MOVQ 0(DI), R8   // h0
	MOVQ 8(DI), R9   // h1
	MOVQ 16(DI), R10 // h2
	MOVQ 24(DI), R11 // r0
	MOVQ 32(DI), R12 // r1

	// Fewer than 16 bytes: skip the full-block loop entirely.
	CMPQ R15, $16
	JB   bytes_between_0_and_15

loop:
	// h += next 16 bytes + 2^128; SI advances by 16.
	POLY1305_ADD(SI, R8, R9, R10)

multiply:
	// h *= r (partially reduced). Also entered from the tail path below.
	POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
	SUBQ $16, R15
	CMPQ R15, $16
	JAE  loop

bytes_between_0_and_15:
	TESTQ R15, R15
	JZ    done

	// Assemble the final partial block in CX:BX. Start from the value 1 so
	// that, after shifting in R15 bytes, a 1 byte sits just above them.
	MOVQ  $1, BX
	XORQ  CX, CX
	XORQ  R13, R13 // zeroed so only the low byte written by MOVB is ever set
	ADDQ  R15, SI  // SI = one past the last message byte; walk backwards

flush_buffer:
	SHLQ $8, BX, CX  // CX:BX <<= 8 (double-register shift, then low half)
	SHLQ $8, BX
	MOVB -1(SI), R13
	XORQ R13, BX     // low byte of BX is zero after the shift, so XOR inserts
	DECQ SI
	DECQ R15
	JNZ  flush_buffer // flags come from DECQ R15

	// h += padded partial block.
	ADDQ BX, R8
	ADCQ CX, R9
	ADCQ $0, R10

	// Pretend exactly one block remains: after multiply, SUBQ leaves R15 = 0,
	// the JAE is not taken, and the TESTQ above branches to done.
	MOVQ $16, R15
	JMP  multiply

done:
	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	RET