Using mukunda's example above, in clang 3.4 with -O2 the issue seems to be in the vectorization phase. The vectorized code jumps on entry to past the vectorized code:
br i1 true, label %middle.block, label %vector.ph
so count's value remains unchanged from its initialization.
*** IR Dump Before Combine redundant instructions ***
; Function Attrs: nounwind readnone ssp uwtable
define i32 @main() #0 {
entry:
  br i1 true, label %middle.block, label %vector.ph
vector.ph:                                        ; preds = %entry
  br label %vector.body
vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 1, i32 0, i32 0, i32 0>, %vector.ph ], [ %4, %vector.body ]
  %vec.phi8 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %induction7 = add <4 x i32> %broadcast.splat, <i32 4, i32 5, i32 6, i32 7>
  %0 = icmp ult <4 x i32> %induction, <i32 5, i32 5, i32 5, i32 5>
  %1 = icmp ult <4 x i32> %induction7, <i32 5, i32 5, i32 5, i32 5>
  %2 = zext <4 x i1> %0 to <4 x i32>
  %3 = zext <4 x i1> %1 to <4 x i32>
  %4 = add <4 x i32> %2, %vec.phi
  %5 = add <4 x i32> %3, %vec.phi8
  %6 = icmp eq <4 x i32> %induction, <i32 -1, i32 -1, i32 -1, i32 -1>
  %7 = icmp eq <4 x i32> %induction7, <i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = add <4 x i32> %induction, <i32 1, i32 1, i32 1, i32 1>
  %9 = add <4 x i32> %induction7, <i32 1, i32 1, i32 1, i32 1>
  %index.next = add i32 %index, 8
  %10 = icmp eq i32 %index.next, 0
  br i1 %10, label %middle.block, label %vector.body, !llvm.loop !1
middle.block:                                     ; preds = %vector.body, %entry
  %resume.val = phi i32 [ 0, %entry ], [ 0, %vector.body ]
  %trunc.resume.val = phi i32 [ 0, %entry ], [ 0, %vector.body ]
  %rdx.vec.exit.phi = phi <4 x i32> [ <i32 1, i32 0, i32 0, i32 0>, %entry ], [ %4, %vector.body ]
  %rdx.vec.exit.phi9 = phi <4 x i32> [ zeroinitializer, %entry ], [ %5, %vector.body ]
  %bin.rdx = add <4 x i32> %rdx.vec.exit.phi9, %rdx.vec.exit.phi
  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx10 = add <4 x i32> %bin.rdx, %rdx.shuf
  %rdx.shuf11 = shufflevector <4 x i32> %bin.rdx10, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx12 = add <4 x i32> %bin.rdx10, %rdx.shuf11
  %11 = extractelement <4 x i32> %bin.rdx12, i32 0
  %cmp.n = icmp eq i32 0, %resume.val
  br i1 %cmp.n, label %while.end, label %scalar.ph
scalar.ph:                                        ; preds = %middle.block
  br label %while.body
while.body:                                       ; preds = %while.body, %scalar.ph
  %i.0 = phi i32 [ %trunc.resume.val, %scalar.ph ], [ %inc, %while.body ]
  %count.0 = phi i32 [ %11, %scalar.ph ], [ %add.count.0, %while.body ]
  %cmp = icmp ult i32 %i.0, 5
  %add = zext i1 %cmp to i32
  %add.count.0 = add i32 %add, %count.0
  %cmp1 = icmp eq i32 %i.0, -1
  %inc = add i32 %i.0, 1
  br i1 %cmp1, label %while.end, label %while.body, !llvm.loop !4
while.end:                                        ; preds = %middle.block, %while.body
  %add.count.0.lcssa = phi i32 [ %add.count.0, %while.body ], [ %11, %middle.block ]
  ret i32 %add.count.0.lcssa
}
The optimizer later erases unreachable and non-effective code - which is almost the entire function body.