Here are some observations I made. I am going to show some Intel syntax assembly code I got from compiling this program. I am using Compiler Explorer. To understand what follows you don't have to know a lot of assembly, the most important element here is size, the bigger it is the slower it is. If I could I would make this post smaller, but the code generated is surprisingly huge, and I don't know enough about Go to know what is useless. If you want to see what each statement converts to in assembly the Compiler Explorer will highlight everything for you.
TL;DR:
The Go compiler is a disastrous mess in my opinion, the C++ code get's very well optimized, and Java is small compared to Go. JIT'ing may have a major effect on the Java code, it may also be too complex for a loop unraveling inline optimization (precompute the value of count).
The Go code compiled down to this monstrosity:
text    "".main(SB), $224-0
movq    (TLS), CX
leaq    -96(SP), AX
cmpq    AX, 16(CX)
jls     835
subq    $224, SP
movq    BP, 216(SP)
leaq    216(SP), BP
funcdata        $0, gclocals·f6bd6b3389b872033d462029172c8612(SB)
funcdata        $1, gclocals·17283ea8379a997487dd6f8baf7ae6ea(SB)
pcdata  $0, $0
call    time.Now(SB)
movq    16(SP), AX
movq    8(SP), CX
movq    (SP), DX
movq    DX, time.t·2+160(SP)
movq    CX, time.t·2+168(SP)
movq    AX, time.t·2+176(SP)
movq    time.t·2+160(SP), AX
movq    AX, CX
shrq    $63, AX
shlq    $63, AX
testq   $-1, AX
jeq     806
movq    CX, DX
shlq    $1, CX
shrq    $31, CX
movq    $59453308800, BX
addq    BX, CX
andq    $1073741823, DX
movlqsx DX, DX
imulq   $1000000000, CX
addq    DX, CX
movq    $-6795364578871345152, DX
addq    DX, CX
movq    $4835703278458516699, AX
imulq   CX
sarq    $63, CX
sarq    $18, DX
subq    CX, DX
movq    DX, "".start+72(SP)
xorl    AX, AX
movq    AX, CX
jmp     257
incq    CX
incq    AX
cmpq    CX, $2000000000
jlt     213
movq    "".i+80(SP), SI
incq    SI
movq    "".start+72(SP), DX
movq    $59453308800, BX
movq    AX, CX
movq    SI, AX
movq    CX, "".count+88(SP)
cmpq    AX, $10
jge     404
movq    AX, "".i+80(SP)
movq    AX, ""..autotmp_24+112(SP)
xorps   X0, X0
movups  X0, ""..autotmp_23+120(SP)
leaq    type.int(SB), CX
movq    CX, (SP)
leaq    ""..autotmp_24+112(SP), DX
movq    DX, 8(SP)
pcdata  $0, $1
call    runtime.convT2E64(SB)
movq    24(SP), AX
movq    16(SP), CX
movq    CX, ""..autotmp_23+120(SP)
movq    AX, ""..autotmp_23+128(SP)
leaq    go.string."On step %d\n"(SB), AX
movq    AX, (SP)
movq    $11, 8(SP)
leaq    ""..autotmp_23+120(SP), CX
movq    CX, 16(SP)
movq    $1, 24(SP)
movq    $1, 32(SP)
pcdata  $0, $1
call    fmt.Printf(SB)
movq    "".count+88(SP), AX
xorl    CX, CX
jmp     219
pcdata  $0, $2
call    time.Now(SB)
movq    16(SP), AX
movq    8(SP), CX
movq    (SP), DX
movq    DX, time.t·2+136(SP)
movq    CX, time.t·2+144(SP)
movq    AX, time.t·2+152(SP)
movq    time.t·2+136(SP), AX
movq    AX, CX
shrq    $63, AX
shlq    $63, AX
testq   $-1, AX
jeq     787
movq    CX, DX
shlq    $1, CX
shrq    $31, CX
movq    $59453308800, BX
addq    BX, CX
imulq   $1000000000, CX
andq    $1073741823, DX
movlqsx DX, DX
addq    DX, CX
movq    $-6795364578871345152, DX
leaq    (DX)(CX*1), AX
movq    AX, "".~R0+64(SP)
movq    $4835703278458516699, CX
imulq   CX
sarq    $18, DX
movq    "".~R0+64(SP), CX
sarq    $63, CX
subq    CX, DX
movq    "".start+72(SP), CX
subq    CX, DX
movq    DX, ""..autotmp_29+104(SP)
movq    "".count+88(SP), CX
movq    CX, ""..autotmp_30+96(SP)
xorps   X0, X0
movups  X0, ""..autotmp_28+184(SP)
movups  X0, ""..autotmp_28+200(SP)
leaq    type.int64(SB), CX
movq    CX, (SP)
leaq    ""..autotmp_29+104(SP), CX
movq    CX, 8(SP)
pcdata  $0, $3
call    runtime.convT2E64(SB)
movq    16(SP), CX
movq    24(SP), DX
movq    CX, ""..autotmp_28+184(SP)
movq    DX, ""..autotmp_28+192(SP)
leaq    type.int(SB), CX
movq    CX, (SP)
leaq    ""..autotmp_30+96(SP), CX
movq    CX, 8(SP)
pcdata  $0, $3
call    runtime.convT2E64(SB)
movq    24(SP), CX
movq    16(SP), DX
movq    DX, ""..autotmp_28+200(SP)
movq    CX, ""..autotmp_28+208(SP)
leaq    go.string."Total time took: %d to get at count: %d\n"(SB), CX
movq    CX, (SP)
movq    $40, 8(SP)
leaq    ""..autotmp_28+184(SP), CX
movq    CX, 16(SP)
movq    $2, 24(SP)
movq    $2, 32(SP)
pcdata  $0, $3
call    fmt.Printf(SB)
movq    216(SP), BP
addq    $224, SP
ret
movq    time.t·2+144(SP), BX
movq    CX, DX
movq    BX, CX
jmp     501
movq    time.t·2+168(SP), SI
movq    CX, DX
movq    $59453308800, BX
movq    SI, CX
jmp     144
nop
pcdata  $0, $-1
call    runtime.morestack_noctxt(SB)
jmp     0
text    "".init(SB), $8-0
movq    (TLS), CX
cmpq    SP, 16(CX)
jls     89
subq    $8, SP
movq    BP, (SP)
leaq    (SP), BP
funcdata        $0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
funcdata        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
movblzx "".initdone·(SB), AX
cmpb    AL, $1
jls     47
movq    (SP), BP
addq    $8, SP
ret
jne     56
pcdata  $0, $0
call    runtime.throwinit(SB)
undef
movb    $1, "".initdone·(SB)
pcdata  $0, $0
call    fmt.init(SB)
pcdata  $0, $0
call    time.init(SB)
movb    $2, "".initdone·(SB)
movq    (SP), BP
addq    $8, SP
ret
nop
pcdata  $0, $-1
call    runtime.morestack_noctxt(SB)
jmp     0
text    type..hash.[2]interface {}(SB), DUPOK, $40-24
movq    (TLS), CX
cmpq    SP, 16(CX)
jls     103
subq    $40, SP
movq    BP, 32(SP)
leaq    32(SP), BP
funcdata        $0, gclocals·d4dc2f11db048877dbc0f60a22b4adb3(SB)
funcdata        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
xorl    AX, AX
movq    "".h+56(SP), CX
jmp     82
movq    AX, "".i+24(SP)
shlq    $4, AX
movq    "".p+48(SP), BX
addq    BX, AX
movq    AX, (SP)
movq    CX, 8(SP)
pcdata  $0, $0
call    runtime.nilinterhash(SB)
movq    16(SP), CX
movq    "".i+24(SP), AX
incq    AX
cmpq    AX, $2
jlt     38
movq    CX, "".~r2+64(SP)
movq    32(SP), BP
addq    $40, SP
ret
nop
pcdata  $0, $-1
call    runtime.morestack_noctxt(SB)
jmp     0
text    type..eq.[2]interface {}(SB), DUPOK, $48-24
movq    (TLS), CX
cmpq    SP, 16(CX)
jls     155
subq    $48, SP
movq    BP, 40(SP)
leaq    40(SP), BP
funcdata        $0, gclocals·8f9cec06d1ae35cc9900c511c5e4bdab(SB)
funcdata        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
xorl    AX, AX
jmp     46
movq    ""..autotmp_8+32(SP), CX
leaq    1(CX), AX
cmpq    AX, $2
jge     140
movq    AX, CX
shlq    $4, AX
movq    "".p+56(SP), DX
movq    8(AX)(DX*1), BX
movq    (AX)(DX*1), SI
movq    "".q+64(SP), DI
movq    8(AX)(DI*1), R8
movq    (AX)(DI*1), AX
cmpq    SI, AX
jne     125
movq    CX, ""..autotmp_8+32(SP)
movq    SI, (SP)
movq    BX, 8(SP)
movq    R8, 16(SP)
pcdata  $0, $0
call    runtime.efaceeq(SB)
movblzx 24(SP), AX
testb   AL, AL
jne     37
movb    $0, "".~r2+72(SP)
movq    40(SP), BP
addq    $48, SP
ret
movb    $1, "".~r2+72(SP)
movq    40(SP), BP
addq    $48, SP
ret
nop
pcdata  $0, $-1
call    runtime.morestack_noctxt(SB)
jmp     0
I have no clue what most of that is doing. I can only hope that most of that is some sort of GC code. I looked up how to enable optimizations for the Go compiler, and all I could find was how to disable optimizations.
Comparatively I looked at a similar function in C++
#include <cstdio>
#include <chrono>
#include <cinttypes>
using namespace std::chrono;
milliseconds getMS()
{
    return duration_cast< milliseconds >(
        system_clock::now().time_since_epoch()
    );
}
int main()
{
    int count = 0;
    milliseconds millis = getMS();
    for(int i = 0; i < 10; ++i)
    {
        printf("On step %d\n", i);
        for(int j = 0; j < 2000000000; ++j)
        {
            ++count;
        }
    }
    milliseconds time = getMS() - millis;
    printf("Total time took: %" PRId64 " to get at count: %d\n", time.count(), count);
}
Which without optimizations compiled down to (compiler x86-64 clang (trunk (probably 6.0.0), flags: -std=c++0x -O0):
main:                                   # @main
        push    rbp
        mov     rbp, rsp
        sub     rsp, 48
        mov     dword ptr [rbp - 4], 0
        mov     dword ptr [rbp - 8], 0
        call    getMS()
        mov     qword ptr [rbp - 16], rax
        mov     dword ptr [rbp - 20], 0
.LBB3_1:                                # =>This Loop Header: Depth=1
        cmp     dword ptr [rbp - 20], 10
        jge     .LBB3_8
        mov     esi, dword ptr [rbp - 20]
        movabs  rdi, offset .L.str
        mov     al, 0
        call    printf
        mov     dword ptr [rbp - 24], 0
        mov     dword ptr [rbp - 44], eax # 4-byte Spill
.LBB3_3:                                #   Parent Loop BB3_1 Depth=1
        cmp     dword ptr [rbp - 24], 2000000000
        jge     .LBB3_6
        mov     eax, dword ptr [rbp - 8]
        add     eax, 1
        mov     dword ptr [rbp - 8], eax
        mov     eax, dword ptr [rbp - 24]
        add     eax, 1
        mov     dword ptr [rbp - 24], eax
        jmp     .LBB3_3
.LBB3_6:                                #   in Loop: Header=BB3_1 Depth=1
        jmp     .LBB3_7
.LBB3_7:                                #   in Loop: Header=BB3_1 Depth=1
        mov     eax, dword ptr [rbp - 20]
        add     eax, 1
        mov     dword ptr [rbp - 20], eax
        jmp     .LBB3_1
.LBB3_8:
        call    getMS()
        mov     qword ptr [rbp - 40], rax
        lea     rdi, [rbp - 40]
        lea     rsi, [rbp - 16]
        call    std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
        mov     qword ptr [rbp - 32], rax
        lea     rdi, [rbp - 32]
        call    std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
        mov     edx, dword ptr [rbp - 8]
        movabs  rdi, offset .L.str.1
        mov     rsi, rax
        mov     al, 0
        call    printf
        mov     edx, dword ptr [rbp - 4]
        mov     dword ptr [rbp - 48], eax # 4-byte Spill
        mov     eax, edx
        add     rsp, 48
        pop     rbp
        ret
.L.str:
        .asciz  "On step %d\n"
.L.str.1:
        .asciz  "Total time took: %ld to get at count: %d\n"
There is actually a lot more code than that, but it is just the chrono implementation, in the optimized code it is just a library function call. I've also removed the implementation of getMS because it is mostly a wrapper method.
With O1 (size) optimizations this turns into:
main:                                   # @main
        push    rbx
        sub     rsp, 32
        call    getMS()
        mov     qword ptr [rsp + 24], rax
        xor     ebx, ebx
.LBB3_1:                                # =>This Inner Loop Header: Depth=1
        mov     edi, offset .L.str
        xor     eax, eax
        mov     esi, ebx
        call    printf
        add     ebx, 1
        cmp     ebx, 10
        jne     .LBB3_1
        call    getMS()
        mov     qword ptr [rsp + 8], rax
        lea     rdi, [rsp + 8]
        lea     rsi, [rsp + 24]
        call    std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
        mov     qword ptr [rsp + 16], rax
        lea     rdi, [rsp + 16]
        call    std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
        mov     rcx, rax
        mov     edi, offset .L.str.1
        mov     edx, -1474836480
        xor     eax, eax
        mov     rsi, rcx
        call    printf
        xor     eax, eax
        add     rsp, 32
        pop     rbx
        ret
.L.str:
        .asciz  "On step %d\n"
.L.str.1:
        .asciz  "Total time took: %ld to get at count: %d\n"
O2 (speed) and O3 (max) optimizations essentially boil down to an unrolled outer loop (which is only there for the print statements), and a precomputed count value.
This mostly shows the terrible code generated by Go and some of the optimizations that occur in C++. But none of this shows exactly what the Java bytecode contains or what that will get JIT'ed down to if it is run enough times. So here is the Java bytecode:
public static void countToTwentyBillion();
    Code:
0: lconst_0
1: lstore_0
2: invokestatic  #2
// Method java/lang/System.currentTimeMillis:()J
5: lstore_2
6: iconst_0
7: istore
4
9: iload
4
11: bipush
10
13: if_icmpge
68
16: getstatic
#3
// Field java/lang/System.out:Ljava/io/PrintStream;
19: new
#4
// class java/lang/StringBuilder
22: dup
23: invokespecial #5
// Method java/lang/StringBuilder.'<init>':()V
26: ldc
#6
// String On step
28: invokevirtual #7
// Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
31: iload
4
33: invokevirtual #8
// Method java/lang/StringBuilder.append:(I)Ljava/lang/StringBuilder;
36: invokevirtual #9
// Method java/lang/StringBuilder.toString:()Ljava/lang/String;
39: invokevirtual #10
// Method java/io/PrintStream.println:(Ljava/lang/String;)V
42: iconst_0
43: istore
5
45: iload
5
47: ldc
#11
// int 2000000000
49: if_icmpge
62
52: lload_0
53: lconst_1
54: ladd
55: lstore_0
56: iinc
5, 1
59: goto
45
62: iinc
4, 1
65: goto
9
68: invokestatic  #2
// Method java/lang/System.currentTimeMillis:()J
71: lstore
4
73: getstatic
#3
// Field java/lang/System.out:Ljava/io/PrintStream;
76: new
#4
// class java/lang/StringBuilder
79: dup
80: invokespecial #5
// Method java/lang/StringBuilder.'<init>':()V
83: ldc
#12
// String Total time took:
85: invokevirtual #7
// Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
88: lload
4
90: lload_2
91: lsub
92: invokevirtual #13
// Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
95: ldc
#14
// String  ms to get at count:
97: invokevirtual #7
// Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
100: lload_0
101: invokevirtual #13
// Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
104: invokevirtual #9
// Method java/lang/StringBuilder.toString:()Ljava/lang/String;
107: invokevirtual #10
// Method java/io/PrintStream.println:(Ljava/lang/String;)V
110: return
Unfortunately at the moment I don't feel like compiling hsdis and JIT'ing the code, but it will probably end up looking like some of the C++ examples. From what I know about the JIT, it might be able to precompute the count value. But this code is a bit complex (in terms of looping) which may make it harder for a fast JIT optimization.