You cannot fully swap the stack of the current thread (allocate your own, delete the old one) in library code, because the old stack holds return addresses, possibly pointers to variables on the stack, and so on.
You also cannot expand the stack: the virtual memory for it has already been allocated (reserved/committed) and is not expandable.
However, it is possible to allocate a temporary stack and switch to it for the duration of a call. In this case you must save the old StackBase and StackLimit from NT_TIB (see this structure in winnt.h), set the new values (you need to allocate memory for the new stack), make the call (switching stacks requires some assembly code - you cannot do it in C/C++ alone), and then restore the original StackBase and StackLimit. Kernel mode has built-in support for this - KeExpandKernelStackAndCallout.
However, user mode has Fibers - they are very rarely used, but they look like a perfect match for this task. With a fiber we can create an additional stack/execution context inside the current thread.
So in general the solution (for a library) is as follows:
On DLL_THREAD_ATTACH:
- convert the thread to a fiber
(ConvertThreadToFiber) (if it returns false, also check GetLastError for ERROR_ALREADY_FIBER - this is also an OK code)
- and create your own fiber by calling CreateFiberEx
We do this only once. Then, every time your procedure that requires a large amount of stack space is called:
- remember the current fiber by calling GetCurrentFiber
- set up the task for your fiber
- switch to your fiber by calling SwitchToFiber
- call the procedure inside the fiber
- return to the original fiber (saved from the GetCurrentFiber call),
again via SwitchToFiber
And finally, on DLL_THREAD_DETACH you need to:
- delete your fiber with DeleteFiber
- convert the fiber back to a thread by calling ConvertFiberToThread, but only
if the initial ConvertThreadToFiber returned true (if it failed with ERROR_ALREADY_FIBER, let whoever first converted the thread to a fiber convert
it back - that is not your task in this case)
You need some (usually small) data associated with your fiber / thread. This must of course be a per-thread variable, so you need to use __declspec(thread) to declare this data, or use TLS directly (or whatever modern C++ features exist for this).
A demo implementation follows:
typedef ULONG (WINAPI * MY_EXPAND_STACK_CALLOUT) (PVOID Parameter);
class FIBER_DATA 
{
public:
    PVOID _PrevFiber, _MyFiber;
    MY_EXPAND_STACK_CALLOUT _pfn;
    PVOID _Parameter;
    ULONG _dwError;
    BOOL _bConvertToThread;
    static VOID CALLBACK _FiberProc( PVOID lpParameter)
    {
        reinterpret_cast<FIBER_DATA*>(lpParameter)->FiberProc();
    }
    VOID FiberProc()
    {
        for (;;)
        {
            _dwError = _pfn(_Parameter);
            SwitchToFiber(_PrevFiber);
        }
    }
public:
    ~FIBER_DATA()
    {
        if (_MyFiber)
        {
            DeleteFiber(_MyFiber);
        }
        if (_bConvertToThread)
        {
            ConvertFiberToThread();
        }
    }
    FIBER_DATA()
    {
        _bConvertToThread = FALSE, _MyFiber = 0;
    }
    ULONG Create(SIZE_T dwStackCommitSize, SIZE_T dwStackReserveSize);
    ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
    {
        _PrevFiber = GetCurrentFiber();
        _pfn = pfn;
        _Parameter = Parameter;
        SwitchToFiber(_MyFiber);
        return _dwError;
    }
};
// Per-thread pointer to the calling thread's FIBER_DATA: assigned by OnAttach(), read by
// DoCallout() and OnDetach().
__declspec(thread) FIBER_DATA* g_pData;
// Prepares the calling thread for callouts: makes sure the thread is a fiber and creates
// the additional fiber whose stack the callouts will run on.
//
// dwStackCommitSize  - initially committed stack size of the new fiber, in bytes
// dwStackReserveSize - reserved stack size of the new fiber, in bytes
// Returns NOERROR on success, otherwise the GetLastError() value of the failing call.
ULONG FIBER_DATA::Create(SIZE_T dwStackCommitSize, SIZE_T dwStackReserveSize)
{
    if (!ConvertThreadToFiber(this))
    {
        // The thread already being a fiber is fine; in that case whoever converted it
        // is responsible for converting it back, so _bConvertToThread stays FALSE.
        ULONG dwConvertError = GetLastError();

        if (dwConvertError != ERROR_ALREADY_FIBER)
        {
            return dwConvertError;
        }
    }
    else
    {
        // We did the conversion, so the destructor has to undo it.
        _bConvertToThread = TRUE;
    }

    _MyFiber = CreateFiberEx(dwStackCommitSize, dwStackReserveSize, 0, _FiberProc, this);

    if (_MyFiber)
    {
        return NOERROR;
    }

    return GetLastError();
}
// Thread-detach handler: destroys the calling thread's FIBER_DATA, if it has one.
void OnDetach()
{
    if (FIBER_DATA* pData = g_pData)
    {
        // Clear the per-thread pointer first so that a later DoCallout() or a repeated
        // OnDetach() on this thread cannot touch or delete the freed object.
        g_pData = 0;
        delete pData;
    }
}
// Thread-attach handler: allocates a FIBER_DATA for the calling thread, creates its fiber
// (2 pages committed, 512 pages reserved) and publishes it through g_pData.
// Returns NOERROR on success or a Win32 error code on failure.
ULONG OnAttach()
{
    FIBER_DATA* pNewData = new FIBER_DATA;

    if (!pNewData)
    {
        return ERROR_NO_SYSTEM_RESOURCES;
    }

    ULONG dwCreateError = pNewData->Create(2*PAGE_SIZE, 512 * PAGE_SIZE);

    if (dwCreateError)
    {
        // Nothing was published yet, so the object can simply be destroyed.
        delete pNewData;
        return dwCreateError;
    }

    g_pData = pNewData;
    return NOERROR;
}
// Demo callout: prints its argument with a %s format and reports success.
// param - passed straight to DbgPrint as the %s argument.
ULONG WINAPI TestCallout(PVOID param)
{
    PVOID pvText = param;

    DbgPrint("TestCallout(%s)\n", pvText);

    return NOERROR;
}
// Runs pfn(Parameter) through the calling thread's FIBER_DATA.
// Returns what FIBER_DATA::DoCallout returns, or ERROR_GEN_FAILURE when this thread has
// no FIBER_DATA (g_pData is null).
ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
{
    FIBER_DATA* pThreadData = g_pData;

    if (!pThreadData)
    {
        return ERROR_GEN_FAILURE;
    }

    return pThreadData->DoCallout(pfn, Parameter);
}
// DLL_THREAD_ATTACH: OnAttach() returns NOERROR on success
if (OnAttach() == NOERROR)
{
    DoCallout(TestCallout, "Demo Task #1");
    DoCallout(TestCallout, "Demo Task #2");

    // DLL_THREAD_DETACH
    OnDetach();
}
Also note that all fibers execute in the context of a single thread - multiple fibers associated with a thread cannot execute concurrently, only sequentially, and you control the switch points yourself. So no additional synchronization is needed. Also, SwitchToFiber is a purely user-mode routine which executes very fast and never fails (because it never allocates any resources).
Update
Although using __declspec(thread) FIBER_DATA* g_pData; is simpler (less code), it is better for the implementation to use TlsGetValue / TlsSetValue directly and to allocate FIBER_DATA on the first call inside a thread, rather than for all threads. Also, __declspec(thread) does not work correctly (does not work at all) on XP for a DLL. So the following modifications can be made:
At DLL_PROCESS_ATTACH allocate your TLS slot: gTlsIndex = TlsAlloc();
and free it on DLL_PROCESS_DETACH:
if (gTlsIndex != TLS_OUT_OF_INDEXES) TlsFree(gTlsIndex);
On every DLL_THREAD_DETACH notification call:
void OnThreadDetach()
{
    if (FIBER_DATA* pData = (FIBER_DATA*)TlsGetValue(gTlsIndex))
    {
        delete pData;
    }
}
and DoCallout needs to be modified as follows:
// Runs pfn(Parameter) on the calling thread's extra fiber, creating the per-thread
// FIBER_DATA lazily on the first call made by that thread.
// Returns the value produced by FIBER_DATA::DoCallout, or a Win32 error code when the
// per-thread state could not be created or stored.
ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
{
    FIBER_DATA* pData = (FIBER_DATA*)TlsGetValue(gTlsIndex);

    if (!pData)
    {
        // this code executed only once on first call
        if (!(pData = new FIBER_DATA))
        {
            return ERROR_NO_SYSTEM_RESOURCES;
        }

        // The fiber-based FIBER_DATA::Create takes the commit size first and the reserve
        // size second: commit a few pages up front, reserve the large range.
        // (or what stack size you need)
        if (ULONG dwError = pData->Create(4*PAGE_SIZE, 512*PAGE_SIZE))
        {
            delete pData;
            return dwError;
        }

        if (!TlsSetValue(gTlsIndex, pData))
        {
            // Without the TLS entry the object would never be found again: it would be
            // leaked and a new fiber would be created on every call.
            ULONG dwError = GetLastError();
            delete pData;
            return dwError;
        }
    }

    return pData->DoCallout(pfn, Parameter);
}
So instead of allocating a stack for every new thread on DLL_THREAD_ATTACH via OnAttach(), it is much better to allocate it only for the threads that really need it (on the first call).
This code can also potentially have problems with fibers if someone else tries to use fibers too. For example, the MSDN example code does not check for ERROR_ALREADY_FIBER when ConvertThreadToFiber returns 0. So we can expect this case to be handled incorrectly by the main application if we create a fiber first and the application then also tries to use fibers after us. Also, ERROR_ALREADY_FIBER is not supported on XP (it is available starting from Vista).
So another solution is also possible - create a thread stack yourself and temporarily switch to it during the call that requires a large amount of stack space. The main point is not only to allocate space for the stack and swap esp (or rsp), but also not to forget to correctly set StackBase and StackLimit in NT_TIB - this is a necessary and sufficient condition (otherwise exceptions and guard-page extension will not work).
Although this alternative solution requires more code (manually creating the thread stack and switching stacks), it works on XP too and does not interfere when somebody else also uses fibers in the thread.
// Signature of a routine that FIBER_DATA::DoCallout runs on the alternate stack.
typedef ULONG (WINAPI * MY_EXPAND_STACK_CALLOUT) (PVOID Parameter);
// Stack-switch primitive implemented in assembly: saves the callee-saved registers on the
// current stack, makes `stack` the current stack pointer, restores the registers saved
// there and returns on that stack. The return value is the stack pointer that was current
// before the switch; `param` stays untouched in the first argument register.
extern "C" PVOID __fastcall SwitchToStack(PVOID param, PVOID stack);
// Per-thread state for running callouts on a manually allocated stack.
struct FIBER_DATA
{
    // _Stack      - base address of the reservation (what the destructor releases)
    // _StackLimit - lowest committed address of the alternate stack (TEB StackLimit value)
    // _StackPtr   - saved stack pointer at which the alternate stack is resumed
    // _StackBase  - upper end of the alternate stack (TEB StackBase value)
    PVOID _Stack, _StackLimit, _StackPtr, _StackBase;
    MY_EXPAND_STACK_CALLOUT _pfn;   // pending task
    PVOID _Parameter;               // argument passed to _pfn
    ULONG _dwError;                 // value returned by the last _pfn call
    // Code that runs on the alternate stack; Create() places its address in the initial
    // frame so the first SwitchToStack "returns" into it. It never returns.
    // pData - the owning FIBER_DATA (the `param` of the first SwitchToStack call)
    // stack - saved stack pointer of the stack to switch back to
    static void __fastcall FiberProc(FIBER_DATA* pData, PVOID stack)
    {
        for (;;)
        {
            pData->_dwError = pData->_pfn(pData->_Parameter);
            // StackLimit can changed during _pfn call
            pData->_StackLimit = ((PNT_TIB)NtCurrentTeb())->StackLimit;
            // Go back to the caller's stack; when DoCallout switches here again, the
            // value returned is the caller's new saved stack pointer.
            stack = SwitchToStack(0, stack);
        }
    }
    // Reserves and commits the alternate stack and builds its initial frame.
    // Returns NOERROR or a Win32 error code.
    ULONG Create(SIZE_T Reserve, SIZE_T Commit);
    // Runs pfn(Parameter) on the alternate stack and returns the value pfn returned.
    ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
    {
        _pfn = pfn;
        _Parameter = Parameter;
        PNT_TIB tib = (PNT_TIB)NtCurrentTeb();
        // Remember the thread's own stack bounds and publish the alternate stack's bounds
        // in the TEB before switching to it.
        PVOID StackBase = tib->StackBase, StackLimit = tib->StackLimit;
        tib->StackBase = _StackBase, tib->StackLimit = _StackLimit;
        // Control comes back here once FiberProc switches back; the value returned is the
        // alternate stack's saved stack pointer, kept for the next call.
        _StackPtr = SwitchToStack(this, _StackPtr);
        // Back on the original stack: restore its bounds in the TEB.
        tib->StackBase = StackBase, tib->StackLimit = StackLimit;
        return _dwError;
    }
    // Releases the whole stack reservation, if one was made.
    ~FIBER_DATA()
    {
        if (_Stack)
        {
            VirtualFree(_Stack, 0, MEM_RELEASE);
        }
    }
    // Only _Stack is initialized here; the remaining members are set by Create() and
    // DoCallout().
    FIBER_DATA()
    {
        _Stack = 0;
    }
};
// Allocates the alternate stack and prepares it for the first SwitchToStack call.
//
// Reserve - size of the address range to reserve, in bytes (rounded up to whole pages)
// Commit  - size to commit at the top of that range, in bytes (rounded up to whole pages);
//           must be non-zero and not larger than Reserve
//
// Layout, from low to high addresses: reserved-only pages, one guard page (only when
// Reserve > Commit), then the committed pages that end at the stack base.
//
// Returns NOERROR on success, ERROR_INVALID_PARAMETER for unusable sizes, otherwise the
// GetLastError() value of the VirtualAlloc call that failed.
ULONG FIBER_DATA::Create(SIZE_T Reserve, SIZE_T Commit)
{
    Reserve = (Reserve + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
    Commit = (Commit + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);

    // A zero Commit leaves no memory for the initial frame. The check also rejects a zero
    // Reserve, because Reserve >= Commit is required.
    if (Reserve < Commit || !Commit)
    {
        return ERROR_INVALID_PARAMETER;
    }

    PBYTE newStack = (PBYTE)VirtualAlloc(0, Reserve, MEM_RESERVE, PAGE_NOACCESS);

    if (!newStack)
    {
        return GetLastError();
    }

    PBYTE newStackBase = newStack + Reserve;

    // Commit the top of the range, then (if any reserved space remains below it) turn
    // the page just under the committed part into a guard page.
    PBYTE newStackLimit = (PBYTE)VirtualAlloc(newStackBase - Commit, Commit, MEM_COMMIT, PAGE_READWRITE);

    if (newStackLimit &&
        (Reserve == Commit || VirtualAlloc(newStackLimit - PAGE_SIZE, PAGE_SIZE, MEM_COMMIT, PAGE_READWRITE|PAGE_GUARD)))
    {
        _StackBase = newStackBase, _StackLimit = newStackLimit, _Stack = newStack;

        // Build the frame that SwitchToStack will pop on the first switch: slots for the
        // callee-saved registers (their initial values do not matter) followed by
        // FiberProc as the "return address".
        void** ppvStack = (void**)newStackBase;
#if defined(_M_IX86)
        *--ppvStack = FiberProc;
        ppvStack -= 4;// ebp,esi,edi,ebx
#elif defined(_M_AMD64)
        ppvStack -= 5;// x64 space
        *--ppvStack = FiberProc;
        ppvStack -= 8;// r15,r14,r13,r12,rbp,rsi,rdi,rbx
#else
#error "not supported"
#endif
        _StackPtr = ppvStack;
        return NOERROR;
    }

    // Read the error before VirtualFree gets a chance to change the last-error value.
    ULONG dwError = GetLastError();
    VirtualFree(newStack, 0, MEM_RELEASE);
    return dwError;
}
// TLS slot that holds the calling thread's FIBER_DATA*. It starts as TLS_OUT_OF_INDEXES
// so that "no slot allocated yet" cannot be confused with the valid slot index 0.
ULONG gTlsIndex = TLS_OUT_OF_INDEXES;
// Runs pfn(Parameter) on the calling thread's alternate stack, creating the per-thread
// FIBER_DATA (512 pages reserved, 4 pages committed) lazily on the first call made by
// that thread.
// Returns the value produced by FIBER_DATA::DoCallout, or a Win32 error code when the
// per-thread state could not be created or stored.
ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
{
    FIBER_DATA* pData = (FIBER_DATA*)TlsGetValue(gTlsIndex);

    if (!pData)
    {
        // this code executed only once on first call
        if (!(pData = new FIBER_DATA))
        {
            return ERROR_NO_SYSTEM_RESOURCES;
        }

        if (ULONG dwError = pData->Create(512*PAGE_SIZE, 4*PAGE_SIZE))
        {
            delete pData;
            return dwError;
        }

        if (!TlsSetValue(gTlsIndex, pData))
        {
            // Without the TLS entry the object would never be found again: the stack
            // would be leaked and a new one allocated on every call.
            ULONG dwError = GetLastError();
            delete pData;
            return dwError;
        }
    }

    return pData->DoCallout(pfn, Parameter);
}
void OnThreadDetach()
{
    if (FIBER_DATA* pData = (FIBER_DATA*)TlsGetValue(gTlsIndex))
    {
        delete pData;
    }
}
And the assembly code for SwitchToStack, on x86:
; PVOID __fastcall SwitchToStack(PVOID param, PVOID stack)      (x86)
;   ecx = param, edx = stack (saved stack pointer of the stack to switch to)
; Saves ebx/edi/esi/ebp on the current stack, makes edx the stack pointer, restores the
; four registers stored on that stack and returns to the address that follows them.
; eax = stack pointer of the stack that was left; ecx (param) is not modified.
@SwitchToStack@8 proc
    push    ebx
    push    edi
    push    esi
    push    ebp
    xchg    esp,edx         ; esp = target stack, edx = stack pointer being left
    mov     eax,edx         ; return value: stack pointer of the stack being left
    pop     ebp             ; restore in reverse order of the pushes above
    pop     esi
    pop     edi
    pop     ebx
    ret                     ; continue at the return address stored on the target stack
@SwitchToStack@8 endp
and for x64:
; PVOID SwitchToStack(PVOID param, PVOID stack)                 (x64)
;   rcx = param, rdx = stack (saved stack pointer of the stack to switch to)
; Saves rbx/rdi/rsi/rbp/r12-r15 on the current stack, makes rdx the stack pointer,
; restores the eight registers stored on that stack and returns to the address that
; follows them.
; rax = stack pointer of the stack that was left; rcx (param) is not modified.
SwitchToStack proc
    push    rbx
    push    rdi
    push    rsi
    push    rbp
    push    r12
    push    r13
    push    r14
    push    r15
    xchg    rsp,rdx         ; rsp = target stack, rdx = stack pointer being left
    mov     rax,rdx         ; return value: stack pointer of the stack being left
    pop     r15             ; restore in reverse order of the pushes above
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    pop     rsi
    pop     rdi
    pop     rbx
    ret                     ; continue at the return address stored on the target stack
SwitchToStack endp
Usage/test can be as follows:
// Demo sequence: the trailing comments name the DLL notification in which each step
// would normally run.
gTlsIndex = TlsAlloc();//DLL_PROCESS_ATTACH
if (gTlsIndex != TLS_OUT_OF_INDEXES)
{
    // Dump the memory regions of the stack this code is currently running on.
    TestStackMemory();
    DoCallout(TestCallout, "test #1");
    // Play with the stack, exceptions and guard pages: the buffer is allocated on the
    // current stack and zTestCallout writes a string into it during the callout.
    PSTR str = (PSTR)alloca(256);
    DoCallout(zTestCallout, str);
    DbgPrint("str=%s\n", str);
    DoCallout(TestCallout, "test #2");
    OnThreadDetach();//DLL_THREAD_DETACH
    TlsFree(gTlsIndex);//DLL_PROCESS_DETACH
}
// Prints every memory region that belongs to the allocation starting at AllocationBase:
// address range, size in pages, state and protection.
// Stops at the first region that belongs to another allocation or when VirtualQuery fails.
void TestMemory(PVOID AllocationBase)
{
    MEMORY_BASIC_INFORMATION mbi;

    for (PVOID pvRegion = AllocationBase; ; )
    {
        if (VirtualQuery(pvRegion, &mbi, sizeof(mbi)) < sizeof(mbi))
        {
            break;
        }

        if (mbi.AllocationBase != AllocationBase)
        {
            break;
        }

        // The end of this region is the start of the next one to query.
        pvRegion = (PBYTE)mbi.BaseAddress + mbi.RegionSize;

        DbgPrint("[%p, %p) %p %08x %08x\n", mbi.BaseAddress, pvRegion, (PVOID)(mbi.RegionSize >> PAGE_SHIFT), mbi.State, mbi.Protect);
    }
}
// Prints the memory regions of the stack this function is currently running on: finds the
// allocation that contains this function's return-address slot and passes its base to
// TestMemory.
void TestStackMemory()
{
    MEMORY_BASIC_INFORMATION mbi;

    if (VirtualQuery(_AddressOfReturnAddress(), &mbi, sizeof(mbi)) < sizeof(mbi))
    {
        return;
    }

    TestMemory(mbi.AllocationBase);
}
// Demo callout that exercises the stack it runs on: stack growth, structured exception
// handling, and writing into a caller-supplied buffer.
// Parameter - destination buffer for strcpy; must have room for "zTestCallout demo"
//             plus the terminating NUL (18 bytes).
// Always returns NOERROR.
ULONG WINAPI zTestCallout(PVOID Parameter)
{
    // Dump the stack regions before and after taking 5 more pages of stack, so the two
    // dumps can be compared.
    TestStackMemory();
    alloca(5*PAGE_SIZE);
    TestStackMemory();
    __try
    {
        // Deliberate write to address 0, to raise an exception inside the __try block.
        *(int*)0=0;
    } 
    __except(EXCEPTION_EXECUTE_HANDLER)
    {
        DbgPrint("exception %x handled\n", GetExceptionCode());
    }
    // Hand a result back to the caller through its buffer.
    strcpy((PSTR)Parameter, "zTestCallout demo");
    return NOERROR;
}
// Demo callout: dumps the memory regions of the stack it runs on, prints its argument
// with a %s format and reports success.
// param - passed straight to DbgPrint as the %s argument.
ULONG WINAPI TestCallout(PVOID param)
{
    TestStackMemory();

    PVOID pvText = param;

    DbgPrint("TestCallout(%s)\n", pvText);

    return NOERROR;
}