I wrote a Go assembly implementation using AVX2 instructions after two days of learning (Go) assembly.
The performance is good: 10x the speed of the simple loop version. Optimizations for compatibility and performance are still needed, though; suggestions and PRs are welcome.
Note: the code and benchmark results have been updated.
I thank @PeterCordes for many valuable suggestions.
#include "textflag.h"
// func AND(x []byte, y []byte)
// Requires: AVX
//
// AND computes x[i] &= y[i] in place for i in [0, min(len(x), len(y))).
// No byte outside that range is read or written, so empty, nil and
// short slices are safe.
//
// Syntax/ABI: Go (Plan 9) assembler, amd64, stack-based ABI0.
// Frame:      $0-48 (no locals; two slice headers of 24 bytes each).
//
// Register roles:
//   AX  cursor into x
//   DX  cursor into y
//   CX  byte count (total before loop32, remaining afterwards)
//   SI  x + n - 32, the last address where a full 32-byte block fits
//   BX  scratch (len(y) at entry, then 8-byte / 1-byte working value)
//   Y0  vector working value (X0 is its low half)
//
// NOTE(review): x and y are presumably either the same slice or
// non-overlapping; partially overlapping slices give chunk-size-dependent
// results. Confirm against callers.
TEXT ·AND(SB), NOSPLIT, $0-48
    MOVQ x_base+0(FP), AX
    MOVQ x_len+8(FP), CX
    MOVQ y_base+24(FP), DX
    MOVQ y_len+32(FP), BX

    // n = min(len(x), len(y)): never read past the end of a shorter y.
    CMPQ    BX, CX
    CMOVQLT BX, CX

    // Fewer than 32 bytes: skip the YMM loop entirely.
    CMPQ CX, $32
    JLT  step16

    // --------------------------------------------
    // 32 bytes per iteration while a whole block still fits.
    LEAQ -32(AX)(CX*1), SI

loop32:
    VMOVDQU (AX), Y0
    VANDPS  (DX), Y0, Y0
    VMOVDQU Y0, (AX)
    ADDQ    $32, AX
    ADDQ    $32, DX
    // Addresses are unsigned quantities: use "lower or same", not JL/JLE.
    CMPQ    AX, SI
    JLS     loop32

    // Clear the upper YMM halves so that non-VEX SSE code executed after
    // we return does not pay an AVX/SSE transition penalty.
    VZEROUPPER

    // floor(n/32) blocks were consumed; n mod 32 bytes (0..31) remain.
    ANDQ $31, CX

    // --------------------------------------------
    // Invariant from here on: CX = bytes remaining, 0 <= CX < 32.
    // At most one 16-byte and one 8-byte step can apply, so no loops.
step16:
    CMPQ    CX, $16
    JLT     step8
    VMOVDQU (AX), X0
    VANDPS  (DX), X0, X0
    VMOVDQU X0, (AX)
    ADDQ    $16, AX
    ADDQ    $16, DX
    SUBQ    $16, CX

step8:
    CMPQ CX, $8
    JLT  tail
    MOVQ (AX), BX
    ANDQ (DX), BX
    MOVQ BX, (AX)
    ADDQ $8, AX
    ADDQ $8, DX
    SUBQ $8, CX

    // --------------------------------------------
    // 0..7 bytes left. Handle them one at a time so that nothing beyond
    // the end of either slice is touched.
tail:
    TESTQ CX, CX
    JEQ   done

tail_loop:
    MOVB (AX), BX
    ANDB (DX), BX
    MOVB BX, (AX)
    INCQ AX
    INCQ DX
    DECQ CX
    JNE  tail_loop

done:
    RET
Benchmark result:
test                       data-size        time
-------------------        ---------        -----------
BenchmarkGrailbio          8.00_B           4.654 ns/op
BenchmarkGoAsm             8.00_B           4.824 ns/op
BenchmarkUnrollLoop        8.00_B           6.851 ns/op
BenchmarkLoop              8.00_B           8.683 ns/op
BenchmarkGrailbio          16.00_B          5.363 ns/op
BenchmarkGoAsm             16.00_B          6.369 ns/op
BenchmarkUnrollLoop        16.00_B          10.47 ns/op
BenchmarkLoop              16.00_B          13.48 ns/op
BenchmarkGoAsm             32.00_B          6.079 ns/op
BenchmarkGrailbio          32.00_B          6.497 ns/op
BenchmarkUnrollLoop        32.00_B          17.46 ns/op
BenchmarkLoop              32.00_B          21.09 ns/op
BenchmarkGoAsm             128.00_B         10.52 ns/op
BenchmarkGrailbio          128.00_B         14.40 ns/op
BenchmarkUnrollLoop        128.00_B         56.97 ns/op
BenchmarkLoop              128.00_B         80.12 ns/op
BenchmarkGoAsm             256.00_B         15.48 ns/op
BenchmarkGrailbio          256.00_B         23.76 ns/op
BenchmarkUnrollLoop        256.00_B         110.8 ns/op
BenchmarkLoop              256.00_B         147.5 ns/op
BenchmarkGoAsm             1.00_KB          47.16 ns/op
BenchmarkGrailbio          1.00_KB          87.75 ns/op
BenchmarkUnrollLoop        1.00_KB          443.1 ns/op
BenchmarkLoop              1.00_KB          540.5 ns/op
BenchmarkGoAsm             16.00_KB         751.6 ns/op
BenchmarkGrailbio          16.00_KB         1342 ns/op
BenchmarkUnrollLoop        16.00_KB         7007 ns/op
BenchmarkLoop              16.00_KB         8623 ns/op