I copied the bootasm.S from https://github.com/jeffallen/xv6/blob/master/bootasm.S, 
#include "asm.h"
# Start the first CPU: switch to 32-bit protected mode, jump into C.
# The BIOS loads this code from the first sector of the hard disk into
# memory at physical address 0x7c00 and starts executing in real mode
# with %cs=0 %ip=7c00.
#define SEG_KCODE 1 // kernel code
#define SEG_KDATA 2 // kernel data+stack
#define CR0_PE 1 // protected mode enable bit
.code16 # Assemble for 16-bit mode
.globl start
start:
        cli # BIOS enabled interrupts; disable
        # Set up the important data segment registers (DS, ES, SS).
        xorw %ax,%ax # Segment number zero
        movw %ax,%ds # -> Data Segment
        movw %ax,%es # -> Extra Segment
        movw %ax,%ss # -> Stack Segment
        # Physical address line A20 is tied to zero so that the first PCs
        # with 2 MB would run software that assumed 1 MB. Undo that.
        seta20.1:
        inb $0x64,%al # Wait for not busy
        testb $0x2,%al
        jnz seta20.1
        movb $0xd1,%al # 0xd1 -> port 0x64
        outb %al,$0x64
        seta20.2:
        inb $0x64,%al # Wait for not busy
        testb $0x2,%al
        jnz seta20.2
        movb $0xdf,%al # 0xdf -> port 0x60
        outb %al,$0x60
        # Switch from real to protected mode. Use a bootstrap GDT that makes
        # virtual addresses map dierctly to physical addresses so that the
        # effective memory map doesn't change during the transition.
        lgdt gdtdesc
        movl %cr0, %eax
        orl $CR0_PE, %eax
        movl %eax, %cr0
        # Complete transition to 32-bit protected mode by using long jmp
        # to reload %cs and %eip. The segment registers are set up with no
        # translation, so that the mapping is still the identity mapping.
        ljmp $(SEG_KCODE<<3), $start32
.code32 # Tell assembler to generate 32-bit code now.
start32:
        # Set up the protected-mode data segment registers
        movw $(SEG_KDATA<<3), %ax # Our data segment selector
        movw %ax, %ds # -> DS: Data Segment
        movw %ax, %es # -> ES: Extra Segment
        movw %ax, %ss # -> SS: Stack Segment
        xor  %eax, %eax  # Zero segments not ready for use 
        movw %ax, %fs # -> FS
        movw %ax, %gs # -> GS
        ## sti TaoWang: It should NOT call STI here, since NO IDT is ready.
        # Set up the stack pointer and call into C.
        movl $start, %esp
        call bootmain
    spin:
        jmp spin
# Bootstrap GDT
.p2align 2 # force 4 byte alignment
gdt:
SEG_NULLASM # null seg
SEG_ASM(STA_X|STA_R, 0x0, 0xffffffff) # code seg
SEG_ASM(STA_W, 0x0, 0xffffffff) # data seg
gdtdesc:
.word (gdtdesc - gdt - 1) # sizeof(gdt) - 1
.long gdt # address gdt
.fill 510-(.-start)
.word 0xaa55
and change the bootmain.c as follows,
#include "types.h"
char    serial_buffer[256];
static void my_memcpy(void *dst, void *src, u32 length)
{
    u32 i = 0;
    for (i = 0; i < length; i ++) {
        *(char *)dst = *(char *)src;
    }
    if (serial_buffer[0] == 'A') {
        asm ("cli\nhlt\n");
    } else {
        asm ("vmcall");
    }
}
int bootmain(void)
{
    my_memcpy(serial_buffer, "Abcedife", 8);
    return 0;
}
void handle_page_fault(void)
{
    return;
}
After the code is built through the Makefile (I listed below), the code to load the output binary is here,
unsigned char tempbuf[0x400];
void file_load(char *vmfname)
{
    int    vmfd = -1;
    size_t cnt = 0, offset = 0;
    vmfd = open( vmfname, O_RDWR );
    if (vmfd < 0) {
        exit(2);
    }
    do {
        cnt = read(vmfd, tempbuf, sizeof(tempbuf));
        // initialize the virtual-machine registers
        memcpy((void *)(CODE_START + offset), tempbuf, cnt);
        offset += cnt;
    } while (cnt > 0);
    close(vmfd);
    printf("Loading %ld bytes of VM to run\n", offset);
}
To my surprise, the while loop does NOT execute at all.
Here is my linker.ld, and I run them in Linux 4.4.0.  
ENTRY(start);
SECTIONS
{
    . = 0x7C00;
    .text : AT(0x7C00)
    {
        _text = .;
        *(.text);
        _text_end = .;
    }
    .data :
    {
        _data = .;
        *(.bss);
        *(.bss*);
        *(.data);
        *(.rodata*);
        *(COMMON)
        _data_end = .;
    }
        PROVIDE(data = .);
        /* The data segment */
        .data : {
                *(.data)
        }
        PROVIDE(edata = .);
        .bss : {
                *(.bss)
        }
        PROVIDE(end = .);
        /DISCARD/ : {
                *(.eh_frame .note.GNU-stack)
        }
}
The Makefile,
all: test
OBJDUMP=objdump
OBJCOPY=objcopy
CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -Wall -MD -ggdb -m32 -Werror -fno-omit-frame-pointer
CFLAGS += $(shell $(CC) -fno-stack-protector -E -x c /dev/null >/dev/null 2>&1 && echo -fno-stack-protector)
ASFLAGS = -m32 -gdwarf-2 -Wa,-divide
LDFLAGS += -m $(shell $(LD) -V | grep elf_i386 2>/dev/null)
guest: test_app.c
        $(CC) -g2 -Wall -Wextra -Werror $^ -o $@
        $(CC) $(CFLAGS) -fno-pic -nostdinc -I. -c bootasm.S
        $(CC) $(CFLAGS) -fno-pic -I. -c bootmain.c
        $(LD) $(LDFLAGS) -N -e start -Tlinker.ld -o bootblock.o bootasm.o bootmain.o
        $(OBJDUMP) -S bootblock.o > bootblock.asm
        $(OBJCOPY) -S -O binary -j .text bootblock.o bootblock.bin
clean:
        rm -f *.o
        rm -f *.d
        rm -f test
        rm -f *.bin
        rm -f bootblock.asm
I don't know why the constant string failed to be passed as the parameter or its content is all '0' ?
If I use an array of char, put the array name as the parameter to myfputs(chararray), it will work well.  
