Here is another implementation, which is much faster than the previous ones:
#include <stddef.h>
#include <stdint.h>
#include <string.h> // for memcpy

// V stores one decimal digit per byte, least significant digit first.
void ConstMult4(uint8_t *V, size_t N, uint8_t digit)
{
    uint8_t CARRY = 0;
    const uint32_t coef7  = digit * 10000000;
    const uint32_t coef6  = digit * 1000000;
    const uint32_t coef5  = digit * 100000;
    const uint32_t coef4  = digit * 10000;
    const uint32_t coef3  = digit * 1000;
    const uint32_t coef2  = digit * 100;
    const uint32_t coef1  = digit * 10;
    const uint32_t coef0  = digit;
    // Lookup table mapping any value in [0;9999] to its 4 decimal digits
    // (least significant digit first), initialized once on the first call
    static uint8_t table[10000][4];
    static int init = 1;
    if(init)
    {
        for(int i=0 ; i<10000 ; ++i)
        {
            table[i][0] = (i / 1) % 10;
            table[i][1] = (i / 10) % 10;
            table[i][2] = (i / 100) % 10;
            table[i][3] = (i / 1000) % 10;
        }
        init = 0;
    }
    for(size_t i=0 ; i<N/8*8 ; i+=8)
    {
        // Combine 8 digits into one base-10^8 block, multiply it by digit and add the incoming carry
        const uint32_t val = V[i+7]*coef7 + V[i+6]*coef6 + V[i+5]*coef5 + V[i+4]*coef4 + V[i+3]*coef3 + V[i+2]*coef2 + V[i+1]*coef1 + V[i+0]*coef0 + CARRY;
        CARRY = val / 100000000; // outgoing carry (at most 8, since val < 9 * 10^8)
        const uint32_t loVal = val % 10000; // low 4-digit half of the block
        const uint32_t hiVal = val / 10000 - CARRY * 10000; // high 4-digit half, i.e. (val / 10000) % 10000
        const uint8_t* loTablePtr = &table[loVal][0];
        const uint8_t* hiTablePtr = &table[hiVal][0];
        // Assume the compiler optimizes the two following calls into plain
        // 4-byte stores (otherwise the performance could be quite bad).
        // memcpy is used to prevent performance issues due to pointer aliasing.
        memcpy(V+i, loTablePtr, 4);
        memcpy(V+i+4, hiTablePtr, 4);
    }
    // Process the remaining digits (less than 8) one by one
    for(size_t i=N/8*8 ; i<N ; ++i)
    {
        V[i] = V[i] * digit + CARRY;
        CARRY = V[i] / 10;
        V[i] -= CARRY * 10;
    }
}
This implementation assumes that the values stored in V, like digit, are actual decimal digits (i.e. in the range 0-9), as illustrated by the sketch below.
It is significantly faster than the other methods because it:
- works with a bigger base (10^8) internally, as proposed by @phuclv: this shortens the carry chain (the critical path) and introduces more instruction-level parallelism;

- uses a lookup table, as proposed by @chqrlieforyellowblockquotes: it enables a very fast computation of the division/modulo operations needed to split each block back into digits.
 
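To make the assumed memory layout concrete, here is a minimal usage sketch (hypothetical: the input number, the padding size and the printing loop are only illustrative). V is padded with leading zeros so the product fits, since the final carry of ConstMult4 is discarded:
#include <stdint.h>
#include <stdio.h>
#include <string.h>

void ConstMult4(uint8_t *V, size_t N, uint8_t digit); // defined above

int main(void)
{
    const char *number = "1234567890123456";
    enum { N = 24 }; // 16 significant digits + room for the product to grow
    uint8_t V[N] = {0};
    const size_t len = strlen(number);
    for(size_t i=0 ; i<len ; ++i)
        V[i] = number[len - 1 - i] - '0'; // reverse into little-endian digits

    ConstMult4(V, N, 7);

    for(size_t i=N ; i-- > 0 ;)
        putchar('0' + V[i]); // prints 000000008641975230864192
    putchar('\n');
    return 0;
}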
This code can be further improved with SSE 4.1 intrinsics (SIMD instructions), at the cost of less portable code (although it will work on most modern x86-64 processors). Here is the implementation:
// Requires SSE 4.1: include <smmintrin.h> and compile with e.g. -msse4.1 (GCC/Clang)
#include <smmintrin.h>

void ConstMult5(uint8_t *V, size_t N, uint8_t digit)
{
    uint8_t CARRY = 0;
    static uint8_t table[10000][4];
    static int init = 1;
    if(init)
    {
        for(int i=0 ; i<10000 ; ++i)
        {
            table[i][0] = (i / 1) % 10;
            table[i][1] = (i / 10) % 10;
            table[i][2] = (i / 100) % 10;
            table[i][3] = (i / 1000) % 10;
        }
        init = 0;
    }
    const __m128i coefs1 = _mm_set_epi16(1000, 100, 10, 1, 1000, 100, 10, 1); // digit weights (arguments go from the most to the least significant element)
    const __m128i coefs2 = _mm_set_epi32(10000*digit, 10000*digit, digit, digit);
    for(size_t i=0 ; i<N/8*8 ; i+=8)
    {
        const __m128i vBlock = _mm_loadl_epi64((const __m128i*)&V[i]); // Load 8 x uint8_t values (a 16-byte load could read past the end of V)
        const __m128i v = _mm_cvtepu8_epi16(vBlock); // Zero-extend the block to 8 x int16_t values
        const __m128i tmp1 = _mm_madd_epi16(v, coefs1); // Compute the sum of adjacent pairs of v * coefs1 and put this in 4 x int32_t values
        const __m128i tmp2 = _mm_add_epi32(tmp1, _mm_shuffle_epi32(tmp1, 0b10110001)); // Horizontal partial sum of 4 x int32_t values
        const __m128i tmp3 = _mm_mul_epu32(tmp2, coefs2); // Compute tmp2 * coefs2 and put this in 2 x int64_t values
        const uint32_t val = _mm_extract_epi64(tmp3, 1) + _mm_extract_epi64(tmp3, 0) + CARRY; // Final horizontal sum with CARRY
        CARRY = val / 100000000;
        const uint32_t loVal = val % 10000;
        const uint32_t hiVal = val / 10000 - CARRY * 10000;
        const uint8_t* loTablePtr = &table[loVal][0];
        const uint8_t* hiTablePtr = &table[hiVal][0];
        // See the memcpy remark in ConstMult4 above.
        memcpy(V+i, loTablePtr, 4);
        memcpy(V+i+4, hiTablePtr, 4);
    }
    // Process the remaining digits (less than 8) one by one
    for(size_t i=N/8*8 ; i<N ; ++i)
    {
        V[i] = V[i] * digit + CARRY;
        CARRY = V[i] / 10;
        V[i] -= CARRY * 10;
    }
}
Here are the performance results on my machine (with an Intel i7-9700KF processor), repeated and averaged over 1000 runs using random inputs:
ConstMult0(10000): 11.702 us
ConstMult3(10000): 6.768 us (last optimized version)
ConstMult4(10000): 3.569 us
ConstMult5(10000): 2.552 us
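For reference, here is roughly the kind of harness used to produce these numbers (a hypothetical sketch, not the exact code: the multiplied digit, the seed and the use of the POSIX clock_gettime are assumptions):
// Hypothetical benchmark sketch: fill V with random digits, run the
// function 1000 times and report the average time per call.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void ConstMult4(uint8_t *V, size_t N, uint8_t digit); // defined above

int main(void)
{
    enum { N = 10000, RUNS = 1000 };
    static uint8_t V[N];
    srand(42);

    double totalUs = 0.0;
    for(int run=0 ; run<RUNS ; ++run)
    {
        for(size_t i=0 ; i<N ; ++i)
            V[i] = rand() % 10; // random decimal digits

        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        ConstMult4(V, N, 7);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        totalUs += (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_nsec - t0.tv_nsec) * 1e-3;
    }
    printf("ConstMult4(%d): %.3f us\n", (int)N, totalUs / RUNS);
    return 0;
}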
The final SSE-based version is 4.6 times faster than your original implementation!