On the Intel Sandy Bridge family, store-forwarding stalls can't pipeline with other store-forwarding stalls: even when they are on independent addresses, they compete with each other for throughput. See "Store forwarding by example", "One more interesting experiment" for Ivy Bridge, and Alex's answer for Coffee Lake (a Skylake derivative).
But a failed (slow-path) store-forwarding doesn't seem to block successful store-forwarding.  Testing on Skylake (i7-6700k) I made a test loop that includes two dependency chains:
- store / wider-reload, causing a store-forwarding failure.  On its own, this chain runs at 15 cycles per iteration, bottlenecked on its latency.
 
- 3x normal store/reload: ~13.425 cycles per iteration with just that.
 
- with both dependency chains in parallel: 15 or 16 cycles per iteration, depending on where the wider reload is placed relative to the other chain (see the comments in the loop).
 
;; Microbenchmark: two loop-carried store/reload dependency chains run in parallel.
;;   chain A (through ecx/rcx): 4-byte store to buf+64, then an 8-byte reload of it
;;                              (reload wider than the store => store-forwarding failure)
;;   chain B (through eax):     3x 4-byte store / 4-byte reload at buf+0
;; Syntax: NASM (Intel operand order); a YASM fallback is selected by the %ifdef below.
;; Output: static executable entered at _start; it is linked with plain ld, no libraries.
;;
;; Build and run:
;; nasm -felf64 testloop.asm
;; ld -o testloop testloop.o
;; taskset -c 3 perf stat -etask-clock:u,context-switches:u,cpu-migrations:u,page-faults:u,cycles:u,branches:u,instructions:u,uops_issued.any:u,uops_executed.thread:u,idq.dsb_uops:u -r1 ./testloop
; Make [symbol] memory operands RIP-relative by default.
default rel
; Assembler-specific setup.
; NOTE(review): the YASM CPU lines are presumably there so that `align` can pad with long NOPs -- confirm.
%ifdef __YASM_VER__
    CPU Conroe AMD
    CPU Skylake AMD
%else
; NASM: smartalign makes `align` pad code with multi-byte NOPs.
; `p6` selects the long-NOP encoding; 64 is smartalign's jump threshold (see the NASM manual).
%use smartalign
alignmode p6, 64
%endif
global _start
_start:
    lea        rdi, [buf]          ; rdi = &buf, the base address for every store and load below
    mov     ebp, 100000000         ; ebp = loop counter: 100 million iterations
; Start the loop body on a 64-byte boundary.
align 64
.loop:
    ; eax and ecx are never initialized by this code: only the dependency through
    ; memory matters here, not the values being stored.
    mov [rdi+64], ecx              ; chain A: 4-byte store of the value loaded by the previous iteration's reload
; Alternative placement of chain A's reload, kept for reference (disabled):
;    mov  rcx, [rdi+64]   ; reload here: 16c.  Or 16.8 if we *also* reload after the %rep block
; chain B: three back-to-back same-width store/reload pairs at buf+0, serialized through eax.
%rep 3
    mov [rdi], eax
    mov eax, [rdi]
%endrep
    ; chain A: 8-byte reload overlapping the 4-byte store at the top of the loop.
    mov  rcx, [rdi+64]     ; reload here: 15c
    dec ebp
    jnz .loop                      ; repeat until the counter reaches zero
.end:
; Exit with status 0, using whichever system-call ABI matches the output format.
;;NASM-only, not YASM:   %if __BITS__ == 32
%ifidn __OUTPUT_FORMAT__, elf32
    mov eax,1
    xor ebx,ebx
    int 0x80     ; sys_exit(0) 32-bit ABI
%else
    xor edi,edi   ; first syscall argument: exit status 0
    mov eax,231   ; __NR_exit_group  from /usr/include/asm/unistd_64.h
    syscall       ; sys_exit_group(0)
%endif
; Scratch buffer for the benchmark: 4096 bytes, aligned to 4096.
; The loop only touches offsets 0 and 64.
section .bss
align 4096
buf:    resb 4096
Performance results:
$ t=testloop; asm-link -dn "$t".asm && taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any,uops_executed.thread,ld_blocks.store_forward,resource_stalls.sb -r2 ./"$t"
+ nasm -felf64 -Worphan-labels testloop.asm
+ ld -o testloop testloop.o
testloop:     file format elf64-x86-64
Disassembly of section .text:
0000000000401000 <_start>:
  401000:       48 8d 3d f9 0f 00 00    lea    rdi,[rip+0xff9]        # 402000 <__bss_start>
  401007:       bd 00 e1 f5 05          mov    ebp,0x5f5e100
  40100c:       0f 1f 84 00 00 00 00 00         nop    DWORD PTR [rax+rax*1+0x0]
  401014:       0f 1f 84 00 00 00 00 00         nop    DWORD PTR [rax+rax*1+0x0]
  40101c:       0f 1f 84 00 00 00 00 00         nop    DWORD PTR [rax+rax*1+0x0]
  401024:       0f 1f 84 00 00 00 00 00         nop    DWORD PTR [rax+rax*1+0x0]
  40102c:       0f 1f 84 00 00 00 00 00         nop    DWORD PTR [rax+rax*1+0x0]
  401034:       0f 1f 84 00 00 00 00 00         nop    DWORD PTR [rax+rax*1+0x0]
  40103c:       0f 1f 40 00             nop    DWORD PTR [rax+0x0]
0000000000401040 <_start.loop>:
  401040:       89 4f 40                mov    DWORD PTR [rdi+0x40],ecx
  401043:       89 07                   mov    DWORD PTR [rdi],eax
  401045:       8b 07                   mov    eax,DWORD PTR [rdi]
  401047:       89 07                   mov    DWORD PTR [rdi],eax
  401049:       8b 07                   mov    eax,DWORD PTR [rdi]
  40104b:       89 07                   mov    DWORD PTR [rdi],eax
  40104d:       8b 07                   mov    eax,DWORD PTR [rdi]
  40104f:       48 8b 4f 40             mov    rcx,QWORD PTR [rdi+0x40]
  401053:       ff cd                   dec    ebp
  401055:       75 e9                   jne    401040 <_start.loop>
0000000000401057 <_start.end>:
  401057:       31 ff                   xor    edi,edi
  401059:       b8 e7 00 00 00          mov    eax,0xe7
  40105e:       0f 05                   syscall
Performance counter stats for './testloop' (two runs):
            385.85 msec task-clock                #    0.999 CPUs utilized            ( +-  0.02% )
                 0      context-switches          #    0.000 /sec
                 0      cpu-migrations            #    0.000 /sec
                 2      page-faults               #    5.183 /sec
     1,503,701,305      cycles                    #    3.897 GHz                      ( +-  0.01% )
     1,000,000,130      instructions              #    0.67  instructions per cycle           ( +-  0.00% )
       900,084,383      uops_issued.any           #    2.333 G/sec                    ( +-  0.00% )
     1,300,091,135      uops_executed.thread      #    3.369 G/sec                    ( +-  0.00% )
        99,933,928      ld_blocks.store_forward   #  258.998 M/sec                    ( +-  0.02% )
       443,686,304      resource_stalls.sb        #    1.150 G/sec                    ( +-  4.87% )
          0.386139 +- 0.000119 seconds time elapsed  ( +-  0.03% )