mov ax,0x0001               ;Mask with only bit 0 set
and ax,dx                   ;AX = bit 0 of DX (0 or 1), so AH is always 0 here
add ah,48                   ;AH = 0 + 48 = '0', whatever DX held
mov byte [HEX_OUT+5],ah     ;Stores AH (not AL) at offset 5 of HEX_OUT
In the above snippet you only keep a single bit where you need to keep 4 bits.
You also do an addition on AH when the result definitely is in AL.
Because of how the ASCII set is organized, you can't simply add 48 to convert a value into a hexadecimal digit. There is a gap between the encoding for '9' (57) and the encoding for 'A' (65). Your code needs to account for this!
For the least significant hex digit:
    ;Least significant hex digit (bits 0-3 of DX) -> character at HEX_OUT+5
    mov ax, dx                 ;Work on a copy, DX stays intact
    and al, 0x0F               ;AL = 0..15
    add al, '0'                ;AL = '0'..'9', or ':'..'?' for 10..15
    cmp al, '9'
    jbe .Store0                ;Decimal digit, nothing more to do
    add al, 'A' - '9' - 1      ;Skip the 7 codes between '9' and 'A'
.Store0:
    mov byte [HEX_OUT + 5], al
For the next hex digit (bits 4-7) this would become:
    ;Second hex digit (bits 4-7 of DX) -> character at HEX_OUT+4
    mov ax, dx                 ;Work on a copy, DX stays intact
    shr ax, 4                  ;Move bits 4-7 down into bits 0-3
    and al, 0x0F               ;AL = 0..15
    add al, '0'                ;AL = '0'..'9', or ':'..'?' for 10..15
    cmp al, '9'
    jbe .Store1                ;Decimal digit, nothing more to do
    add al, 'A' - '9' - 1      ;Skip the 7 codes between '9' and 'A'
.Store1:
    mov byte [HEX_OUT + 4], al
For the third hex digit (bits 8-11) this would become:
    ;Third hex digit (bits 8-11 of DX) -> character at HEX_OUT+3
    mov ax, dx                 ;Work on a copy, DX stays intact
    shr ax, 8                  ;Move bits 8-11 down into bits 0-3
    and al, 0x0F               ;AL = 0..15
    add al, '0'                ;AL = '0'..'9', or ':'..'?' for 10..15
    cmp al, '9'
    jbe .Store2                ;Decimal digit, nothing more to do
    add al, 'A' - '9' - 1      ;Skip the 7 codes between '9' and 'A'
.Store2:
    mov byte [HEX_OUT + 3], al
For the most significant hex digit (bits 12-15) this would become:
    ;Most significant hex digit (bits 12-15 of DX) -> character at HEX_OUT+2
    mov ax, dx                 ;Work on a copy, DX stays intact
    shr ax, 12                 ;Move bits 12-15 down into bits 0-3
    and al, 0x0F               ;AL = 0..15
    add al, '0'                ;AL = '0'..'9', or ':'..'?' for 10..15
    cmp al, '9'
    jbe .Store3                ;Decimal digit, nothing more to do
    add al, 'A' - '9' - 1      ;Skip the 7 codes between '9' and 'A'
.Store3:
    mov byte [HEX_OUT + 2], al
This rapidly got longer than is good for us, so using a loop will be much better.
The next solution starts from the most significant digit, but the end result is no different.
    ;Convert the 16-bit value in DX into 4 hex characters stored at
    ;HEX_OUT+2 .. HEX_OUT+5, most significant digit first.
    ;DX is rotated LEFT by 4 bits per iteration so that the topmost
    ;remaining digit lands in the lowest 4 bits (a right rotation would
    ;deliver the digits in the wrong order). After 4 iterations DX has
    ;been rotated by a full 16 bits and holds its original value again.
    ;Modifies AL, BX and the flags.
    mov bx, 2      ;Position for most significant digit
.Next:
    rol dx, 4      ;Bring the topmost digit into the lowest 4 bits
    mov al, dl     ;Copy number
    and al, 15     ;Keep 4 bits
    add al, '0'    ;Make text
    cmp al, '9'
    jbe .Store     ;Already fine for '0' to '9'
    add al, 7      ;Bridge the gap to reach 'A' to 'F'
.Store:
    mov [HEX_OUT + bx], al
    inc bx
    cmp bx, 6      ;Did we fill chars at +2 +3 +4 +5 ?
    jb  .Next      ;Not yet
Because there are 4 iterations in this loop and the number in DX is rotated by 4 bits each time (16 bits in total), DX will hold the original value in the end. No need to preserve it.
jmp endi;       Transfers control to the label endi (defined outside this snippet)
What's this supposed to achieve? This is jumping to data and that's certainly not executable code! If you want an endless loop then simply write:
jmp $           ;Endless loop: $ is the address of this very instruction
The other file, that you say is working with other modules, is a mess!!
Everybody keeps neglecting this, but the BIOS teletype function requires the BH register to have the desired display page. Therefore it's always a bad idea to use BX as the string pointer.
Here's a good solution that doesn't require you to change all of your existing code (concerning the use of BX):
;-----------------------------------------------------------------------
; print_string
; Writes the zero-terminated string at BX through the BIOS teletype
; function (int 0x10, AH=0x0E) on display page 0.
; In:    BX = address of the string (read through SI by LODSB)
; Out:   nothing; all general registers are restored by POPA
; Note:  the direction flag is left cleared
;-----------------------------------------------------------------------
print_string:
    pusha
    cld                ;LODSB must walk upwards through the string
    mov     si, bx     ;SI = string pointer, which frees BH for the BIOS
    mov     bh, 0      ;BH = display page 0
    ;mov     bl, 7      ;Color if this were a graphical screen
  .fetch:
    lodsb              ;AL = next character, SI advanced by one
    cmp     al, 0      ;Terminating zero reached?
    je      .done
    mov     ah, 0x0E   ;BIOS.Teletype
    int     0x10
    jmp     .fetch
  .done:
    popa
    ret