definitely yes!
Here is a demonstration of a CRC-32 calculation that I wrote in C++ and then optimized in x86 assembler using Visual Studio.
InitCRC32Table() should be called at program start.
CalcCRC32() will calculate the CRC for a given memory block.
Both functions are implemented in both assembler and C++.
On a typical Pentium machine, you will notice that the assembler CalcCRC32() function is 50% faster than the C++ code.
The assembler implementation is not MMX or SSE, but simple x86 code.
The compiler will never produce code that is as efficient as manually crafted assembler code.
    // Pointer to the 256-entry CRC-32 lookup table. It is NULL until
    // InitCRC32Table() allocates and fills it, and is reset to NULL by
    // DoneCRCTables(). The table is generated with the constant 0xEDB88320,
    // the bit-reversed form of the polynomial 0x04C11DB7 named below.
    DWORD* panCRC32Table = NULL; // CRC-32 CCITT 0x04C11DB7
    // Frees the CRC-32 lookup table and resets the global pointer so that a
    // later InitCRC32Table() call allocates a fresh table. Calling it when no
    // table exists is harmless.
    void DoneCRCTables()
    {
        delete[] panCRC32Table;   // delete[] on a null pointer does nothing
        panCRC32Table = NULL;
    }
    // Builds the 256-entry CRC-32 lookup table pointed to by panCRC32Table.
    //
    // The first call allocates the table, registers DoneCRCTables() with
    // atexit() so the memory is released at program exit, and fills every
    // entry. Later calls return immediately because the pointer is already
    // non-NULL.
    //
    // Each entry is the index value run through 8 shift-right steps; whenever
    // the bit shifted out is 1, the value is XORed with 0xEDB88320.
    void InitCRC32Table()
    {
        if (panCRC32Table) return;          // table already built
        panCRC32Table= new DWORD[256];
        atexit(DoneCRCTables);              // free the table at program exit
    // Disabled C++ version of the table generation; the assembler below
    // performs the same steps with the same register-named variables.
    /*
        for (int bx=0; bx<256; bx++)
        {
            DWORD eax= bx;
            for (int cx=8; cx>0; cx--)
                if (eax & 1)
                    eax= (eax>>1) ^ 0xEDB88320;
                else
                    eax= (eax>>1)             ;
            panCRC32Table[bx]= eax;
        }
    */
            _asm cld                                  // forward direction: stosd will increment edi
            _asm mov    edi, panCRC32Table            // edi = write cursor, starts at table[0]
            _asm xor    ebx, ebx                      // ebx = table index, starts at 0
        p0: _asm mov    eax, ebx                      // eax = working value for this index
            _asm mov    ecx, 8                        // 8 shift steps per entry
        p1: _asm shr    eax, 1                        // low bit goes to the carry flag
            _asm jnc    p2                            // bit was 0: skip the XOR
            _asm xor    eax, 0xEDB88320           // bit-swapped 0x04C11DB7
        p2: _asm loop   p1                            // ecx--, repeat while ecx != 0
            _asm stosd                                // store eax at [edi], edi += 4
            _asm inc    bl                            // next index; bl wraps to 0 after 255
            _asm jnz    p0                            // stop once bl has wrapped (256 entries written)
    }
/*
DWORD inline CalcCRC32(UINT nLen, const BYTE* cBuf, DWORD nInitVal= 0)
{
    DWORD crc= ~nInitVal;
    for (DWORD n=0; n<nLen; n++)
        crc= (crc>>8) ^ panCRC32Table[(crc & 0xFF) ^ cBuf[n]];
    return ~crc;
}
*/
// Computes the CRC-32 of a memory block using the table at panCRC32Table.
//
// Parameters:
//   nLen     - number of bytes to process; read from ecx (first __fastcall
//              argument).
//   cBuf     - start of the data; read from edx (second __fastcall argument).
//   nInitVal - starting CRC, read from the stack at [esp+4]. Pass 0 for a
//              single buffer, or the value returned for the preceding buffer
//              when the data is split across several buffers.
//
// Returns the CRC in eax. When nLen is 0 the function returns nInitVal
// unchanged, because the jump to p2 happens before the value is inverted.
//
// The function is naked, so it has no compiler-generated prologue or
// epilogue: it saves and restores esi and ebp itself and removes the one
// stack argument with "ret 4". It reads panCRC32Table without a null check,
// so InitCRC32Table() must have been called first.
DWORD inline __declspec (naked) __fastcall CalcCRC32(UINT        nLen       ,
                                                     const BYTE* cBuf       ,
                                                     DWORD       nInitVal= 0 ) // used to calc CRC of chained bufs
{
        _asm mov    eax, [esp+4]         // param3: nInitVal (read before any push changes esp)
        _asm jecxz  p2                   // __fastcall param1 ecx: nLen; nothing to do when it is 0
        _asm not    eax                  // crc = ~nInitVal
        _asm push   esi                  // save the registers used as scratch below
        _asm push   ebp
        _asm mov    esi, edx             // __fastcall param2 edx: cBuf
        _asm xor    edx, edx             // clear edx so that dl can be used as a 0..255 table index
        _asm mov    ebp, panCRC32Table   // ebp = base address of the lookup table
        _asm cld                         // NOTE(review): no string instruction follows in this function
    p1: _asm mov    dl , al              // dl = low byte of crc
        _asm shr    eax, 8               // crc >>= 8
        _asm xor    dl , [esi]           // dl ^= current data byte
        _asm xor    eax, [ebp+edx*4]     // crc ^= table[dl] (4-byte entries)
        _asm inc    esi                  // advance to the next data byte
        _asm loop   p1                   // ecx--, repeat while bytes remain
        _asm pop    ebp                  // restore saved registers in reverse order
        _asm pop    esi
        _asm not    eax                  // result = ~crc
    p2: _asm ret    4                    // eax- returned value. 4 because there is 1 param in stack
}
// test code:
#include "mmSystem.h"                      // timeGetTime
#pragma comment(lib, "Winmm.lib" )
InitCRC32Table();
BYTE* x= new BYTE[1000000];
for (int i= 0; i<1000000; i++) x[i]= 0;
DWORD d1= ::timeGetTime();
for (i= 0; i<1000; i++)
    CalcCRC32(1000000, x, 0);
DWORD d2= ::timeGetTime();
TRACE("%d\n", d2-d1);