unsigned int fun0 ( unsigned int );
/*
 * fun1 - return x plus one.
 *
 * Declared static, so the symbol is local to this translation unit.
 * The arithmetic is unsigned, so it wraps on overflow.
 */
static unsigned int fun1(unsigned int x)
{
    const unsigned int incremented = x + 1u;

    return incremented;
}
/*
 * fun2 - return x plus two.
 *
 * No storage-class specifier, so the function has external linkage.
 * The arithmetic is unsigned, so it wraps on overflow.
 */
unsigned int fun2(unsigned int x)
{
    return x + 2u;
}
/*
 * fun3 - return x plus three.
 *
 * Plain `inline` (neither `static` nor `extern`): under C99/C11 rules this
 * is an inline definition only, so this translation unit does not supply an
 * external definition of fun3.
 */
inline unsigned int fun3(unsigned int x)
{
    return x + 3u;
}
/*
 * hello - pass x through fun0, fun1, fun2 and fun3, in that order, and
 * return the final value.
 */
unsigned int hello(unsigned int x)
{
    /* Innermost call runs first: fun0, then fun1, fun2, fun3. */
    return fun3(fun2(fun1(fun0(x))));
}
Intentionally using a different instruction set:
Disassembly of section .text:
00000000 <fun2>:
   0:   e2800002    add r0, r0, #2
   4:   e12fff1e    bx  lr
00000008 <hello>:
   8:   e92d4010    push    {r4, lr}
   c:   ebfffffe    bl  0 <fun0>
  10:   e8bd4010    pop {r4, lr}
  14:   e2800006    add r0, r0, #6
  18:   e12fff1e    bx  lr
fun0() is external; the compiler doesn't have visibility into it, so it has to set up a call and take the return value.
fun1() is marked static, so we have indicated that we want the function to be local to this object/file/scope. There is no reason for the compiler to emit a standalone function for others to access remotely, and since the optimizer can see the function (it is in the same file), it chooses to inline it.
fun2() has no special markings, so it is assumed to be global and the compiler needs to provide code that performs that function for others to possibly consume. At the same time, the optimizer sees that function (it is in the same file), so it inlines it just like fun1().
For fun3() we indicated that the compiler can inline it, somewhat implying that it is for consumption in this scope, so, as with static, the compiler did not generate code for global consumption, and it optimized (inlined) the call.
Functionally, hello() takes x and sends it to fun0(), which turns it into y. We then add 1+2+3 = 6 to it. So to inline fun1, fun2 and fun3 you simply add 6 to the output of fun0(). And that is what we see: fun1(), fun2() and fun3() are inlined.
Maybe the confusion here is what inline means: it means "in line". Don't call the function; include its functionality in line with the caller.
/*
 * fun2 - return x plus two (unsigned arithmetic, wraps on overflow).
 */
unsigned int fun2(unsigned int x)
{
    x += 2u;
    return x;
}
/*
 * hello - thin wrapper that returns whatever fun2 returns for x.
 */
unsigned int hello(unsigned int x)
{
    unsigned int result = fun2(x);

    return result;
}
With the tool I am using, I didn't actually need to ask it to inline:
00000000 <fun2>:
   0:   e2800002    add r0, r0, #2
   4:   e12fff1e    bx  lr
00000008 <hello>:
   8:   e2800002    add r0, r0, #2
   c:   e12fff1e    bx  lr
The optimizer did it anyway: instead of setting up a call to fun2, it took the functionality of fun2, which was to add 2 to the operand, and simply did that in hello, IN LINE.
With your tool, notice that the global function is created either way, but when you asked it to inline it doesn't look like it actually did anything. Check the disassembly along with the assembly; the disassembly is usually easier to read and less confusing.
Note: using my first example and a C++ compiler, so I don't get a "hey, you didn't use a C++ compiler":
0000000000000000 <_Z4fun2j>:
   0:   8d 47 02                lea    0x2(%rdi),%eax
   3:   c3                      retq   
   4:   66 90                   xchg   %ax,%ax
   6:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
   d:   00 00 00 
0000000000000010 <_Z5helloj>:
  10:   48 83 ec 08             sub    $0x8,%rsp
  14:   e8 00 00 00 00          callq  19 <_Z5helloj+0x9>
  19:   48 83 c4 08             add    $0x8,%rsp
  1d:   83 c0 06                add    $0x6,%eax
  20:   c3                      retq   
Same story: the inline and static functions did not produce a global function for others to use, and the compiler generated a call for the external function, then added 6 to the result.
Note no optimization:
00000000 <fun1>:
   0:   e52db004    push    {r11}       ; (str r11, [sp, #-4]!)
   4:   e28db000    add r11, sp, #0
   8:   e24dd00c    sub sp, sp, #12
   c:   e50b0008    str r0, [r11, #-8]
  10:   e51b3008    ldr r3, [r11, #-8]
  14:   e2833001    add r3, r3, #1
  18:   e1a00003    mov r0, r3
  1c:   e28bd000    add sp, r11, #0
  20:   e49db004    pop {r11}       ; (ldr r11, [sp], #4)
  24:   e12fff1e    bx  lr
00000028 <fun2>:
  28:   e52db004    push    {r11}       ; (str r11, [sp, #-4]!)
  2c:   e28db000    add r11, sp, #0
  30:   e24dd00c    sub sp, sp, #12
  34:   e50b0008    str r0, [r11, #-8]
  38:   e51b3008    ldr r3, [r11, #-8]
  3c:   e2833002    add r3, r3, #2
  40:   e1a00003    mov r0, r3
  44:   e28bd000    add sp, r11, #0
  48:   e49db004    pop {r11}       ; (ldr r11, [sp], #4)
  4c:   e12fff1e    bx  lr
00000050 <hello>:
  50:   e92d4800    push    {r11, lr}
  54:   e28db004    add r11, sp, #4
  58:   e24dd010    sub sp, sp, #16
  5c:   e50b0010    str r0, [r11, #-16]
  60:   e51b0010    ldr r0, [r11, #-16]
  64:   ebfffffe    bl  0 <fun0>
  68:   e50b0008    str r0, [r11, #-8]
  6c:   e51b0008    ldr r0, [r11, #-8]
  70:   ebffffe2    bl  0 <fun1>
  74:   e50b0008    str r0, [r11, #-8]
  78:   e51b0008    ldr r0, [r11, #-8]
  7c:   ebfffffe    bl  28 <fun2>
  80:   e50b0008    str r0, [r11, #-8]
  84:   e51b0008    ldr r0, [r11, #-8]
  88:   ebfffffe    bl  0 <fun3>
  8c:   e50b0008    str r0, [r11, #-8]
  90:   e51b3008    ldr r3, [r11, #-8]
  94:   e1a00003    mov r0, r3
  98:   e24bd004    sub sp, r11, #4
  9c:   e8bd4800    pop {r11, lr}
  a0:   e12fff1e    bx  lr
It calls them all, with no inlining... What optimization did you use in your test? What if you try optimizing? (llvm/clang gives you multiple optimization opportunities over gnu.)
EDIT using llvm and optimization.
two separate files
/*
 * fun0 - return x plus seven (unsigned arithmetic, wraps on overflow).
 */
unsigned int fun0(unsigned int x)
{
    const unsigned int sum = x + 7u;

    return sum;
}
and this one
unsigned int fun0 ( unsigned int );
/*
 * fun3 - return x plus three.
 *
 * Plain `inline` without `static` or `extern`: under C99/C11 rules this is
 * an inline definition only, so no external definition of fun3 comes from
 * this translation unit.
 */
inline unsigned int fun3(unsigned int x)
{
    x += 3u;
    return x;
}
/*
 * hello - feed x to fun0, pass that result to fun3, and return the outcome.
 */
unsigned int hello(unsigned int x)
{
    /* fun0 is evaluated first; its result becomes fun3's argument. */
    return fun3(fun0(x));
}
build without optimization
0000000000000000 <fun0>:
   0:   55                      push   %rbp
   1:   48 89 e5                mov    %rsp,%rbp
   4:   89 7d fc                mov    %edi,-0x4(%rbp)
   7:   8d 47 07                lea    0x7(%rdi),%eax
   a:   5d                      pop    %rbp
   b:   c3                      retq   
and
0000000000000000 <hello>:
   0:   55                      push   %rbp
   1:   48 89 e5                mov    %rsp,%rbp
   4:   48 83 ec 10             sub    $0x10,%rsp
   8:   89 7d fc                mov    %edi,-0x4(%rbp)
   b:   e8 00 00 00 00          callq  10 <hello+0x10>
  10:   89 45 f8                mov    %eax,-0x8(%rbp)
  13:   89 c7                   mov    %eax,%edi
  15:   e8 00 00 00 00          callq  1a <hello+0x1a>
  1a:   89 45 f8                mov    %eax,-0x8(%rbp)
  1d:   48 83 c4 10             add    $0x10,%rsp
  21:   5d                      pop    %rbp
  22:   c3                      retq   
Post compile, I was hoping for fun0 to be inlined; oh well, it did optimize hello:
0000000000000000 <fun0>:
   0:   55                      push   %rbp
   1:   48 89 e5                mov    %rsp,%rbp
   4:   8d 47 07                lea    0x7(%rdi),%eax
   7:   5d                      pop    %rbp
   8:   c3                      retq   
   9:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
0000000000000010 <hello>:
  10:   55                      push   %rbp
  11:   48 89 e5                mov    %rsp,%rbp
  14:   83 c7 07                add    $0x7,%edi
  17:   e8 00 00 00 00          callq  1c <hello+0xc>
  1c:   5d                      pop    %rbp
  1d:   c3                      retq   
compiled with optimizations.
0000000000000000 <fun0>:
   0:   8d 47 07                lea    0x7(%rdi),%eax
   3:   c3                      retq   
0000000000000000 <hello>:
   0:   50                      push   %rax
   1:   e8 00 00 00 00          callq  6 <hello+0x6>
   6:   83 c0 03                add    $0x3,%eax
   9:   59                      pop    %rcx
   a:   c3                      retq   
clang gives you different optimization opportunities.
Okay, that got it. As your number of files increases, the number of optimization combinations for the llvm tools goes up nearly exponentially. For bigger projects I found that compiling unoptimized gives the later optimizer more meat to work with, but of course it depends on a number of factors, and unfortunately the combinations become staggering. If I compile with optimizations first, then combine and optimize later, I get what I wanted.
0000000000000000 <fun0>:
   0:   8d 47 07                lea    0x7(%rdi),%eax
   3:   c3                      retq   
0000000000000010 <hello>:
  10:   8d 47 0a                lea    0xa(%rdi),%eax
  13:   c3                      retq   
fun3 added 3 and fun0 added 7; the call to fun0 was inlined, and from two files, one external function and one internal inlined function, I end up with just "add 10".
I used C here, but for llvm/clang, like gnu, that is just a front end; what happens in the middle, as shown above with gnu, should behave the same independent of C and C++ (as far as the optimizer doing automatic or suggested inlining).