Finally got it. 
The idea is to draw each marker as a white box on a black image, then crop the region we want and warp it into a new image. Since the correct size of the new image is unknown, we simply make it square. The new image should then be black with a white box at each corner. Starting from (0,0), we walk diagonally across the image and check each pixel value, which should be white. Once a pixel is black, we have left the white box. We then trace back along x and y, because the white box might be tall or wide. Once we find the bottom-right corner of the white box, we know its size. Compute the scale that would turn this white box into a square, and use the same scale to resize the image.
This is the image captured by the camera:

Draw the markers as white boxes on a black image.

Crop the region and warp it into a square.

Get the width and height of the white box in the top-left corner.
Once we have the scale factor, apply it.

In case anyone is interested, here is the code.
// Get3dRectFrom2d.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include "pch.h"
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/aruco.hpp>
#define CAMERA_WINDOW "Simple ArUco"
using namespace std;
using namespace cv;
// Loads the camera calibration stored in `filename`.
//
// Reads the "camera_matrix" entry into `camMatrix` and the
// "distortion_coefficients" entry into `distCoeffs`.
// Returns false (leaving both outputs untouched) when the file cannot be
// opened, true otherwise.
static bool readCameraParameters(string filename, Mat &camMatrix, Mat &distCoeffs) {
    FileStorage storage(filename, FileStorage::READ);
    const bool opened = storage.isOpened();
    if (opened) {
        storage["camera_matrix"] >> camMatrix;
        storage["distortion_coefficients"] >> distCoeffs;
    }
    return opened;
}
int main()
{
    Mat camMatrix, distCoeffs;
    string cameraSettings = "camera.txt";
    bool estimatePose = false;
    bool showRejected = true;
    if (readCameraParameters(cameraSettings, camMatrix, distCoeffs))
    {
        estimatePose = true;
    }
    Ptr<aruco::Dictionary> dictionary =
        aruco::getPredefinedDictionary(aruco::PREDEFINED_DICTIONARY_NAME(aruco::DICT_4X4_50));
    Ptr<aruco::DetectorParameters> detectorParams = aruco::DetectorParameters::create();
    float markerLength = 3.75f;
    float markerSeparation = 0.5f;
    double totalTime = 0;
    int totalIterations = 0;
    VideoCapture inputVideo(0);
    if (!inputVideo.isOpened())
    {
        cout << "cannot open camera";
    }
    double prevW = -1, prevH = -1;
    double increment = 0.1;
    while (inputVideo.grab())
    {
        Mat image, imageCopy;
        inputVideo.retrieve(image);
        double tick = (double)getTickCount();
        vector< int > ids;
        vector< vector< Point2f > > corners, rejected;
        vector< Vec3d > rvecs, tvecs;
        // detect markers and estimate pose
        aruco::detectMarkers(image, dictionary, corners, ids, detectorParams, rejected);
        if (estimatePose && ids.size() > 0)
            aruco::estimatePoseSingleMarkers(corners, markerLength, camMatrix, distCoeffs, rvecs,
                tvecs);
        double currentTime = ((double)getTickCount() - tick) / getTickFrequency();
        totalTime += currentTime;
        totalIterations++;
        if (totalIterations % 30 == 0) {
            cout << "Detection Time = " << currentTime * 1000 << " ms "
                << "(Mean = " << 1000 * totalTime / double(totalIterations) << " ms)" << endl;
        }
        // draw results
        image.copyTo(imageCopy);
        if (ids.size() > 0) {
            aruco::drawDetectedMarkers(imageCopy, corners, ids);
            if (estimatePose) {
                for (unsigned int i = 0; i < ids.size(); i++)
                    aruco::drawAxis(imageCopy, camMatrix, distCoeffs, rvecs[i], tvecs[i],
                        markerLength * 0.5f);
            }
        }
        if (ids.size() == 4)
        {
            if (true)
            {
                // process the image
                array<Point2f, 4> srcCorners;           // corner that we want 
                array<Point2f, 4> dstCorners;           // destination corner   
                vector<Point> marker0;          // marker corner
                vector<Point> marker1;          // marker corner
                vector<Point> marker2;          // marker corner
                vector<Point> marker3;          // marker corner
                //id  8 14 18 47
                for (size_t i = 0; i < ids.size(); i++)
                {
                    // first corner
                    if (ids[i] == 8)
                    {
                        srcCorners[0] = corners[i][0];      // get the first point
                        //srcCornersSmall[0] = corners[i][2];
                        marker0.push_back(corners[i][0]);
                        marker0.push_back(corners[i][1]);
                        marker0.push_back(corners[i][2]);
                        marker0.push_back(corners[i][3]);
                    }
                    // second corner
                    else if (ids[i] == 14)
                    {
                        srcCorners[1] = corners[i][1];      // get the second point
                        //srcCornersSmall[1] = corners[i][3];
                        marker1.push_back(corners[i][0]);
                        marker1.push_back(corners[i][1]);
                        marker1.push_back(corners[i][2]);
                        marker1.push_back(corners[i][3]);
                    }
                    // third corner
                    else if (ids[i] == 18)
                    {
                        srcCorners[2] = corners[i][2];      // get the thirt point
                        //srcCornersSmall[2] = corners[i][0];
                        marker2.push_back(corners[i][0]);
                        marker2.push_back(corners[i][1]);
                        marker2.push_back(corners[i][2]);
                        marker2.push_back(corners[i][3]);
                    }
                    // fourth corner
                    else if (ids[i] == 47)
                    {
                        srcCorners[3] = corners[i][3];      // get the fourth point
                        //srcCornersSmall[3] = corners[i][1];
                        marker3.push_back(corners[i][0]);
                        marker3.push_back(corners[i][1]);
                        marker3.push_back(corners[i][2]);
                        marker3.push_back(corners[i][3]);
                    }
                }
                // create a black image with the same size of cam image
                Mat mask = Mat::zeros(imageCopy.size(), CV_8UC1);
                Mat dstImage = Mat::zeros(imageCopy.size(), CV_8UC1);
                // draw white fill on marker corners
                {
                    int num = (int)marker0.size();
                    if (num != 0)
                    {
                        const Point * pt4 = &(marker0[0]);
                        fillPoly(mask, &pt4, &num, 1, Scalar(255, 255, 255), 8);
                    }
                }
                {
                    int num = (int)marker1.size();
                    if (num != 0)
                    {
                        const Point * pt4 = &(marker1[0]);
                        fillPoly(mask, &pt4, &num, 1, Scalar(255, 255, 255), 8);
                    }
                }
                {
                    int num = (int)marker2.size();
                    if (num != 0)
                    {
                        const Point * pt4 = &(marker2[0]);
                        fillPoly(mask, &pt4, &num, 1, Scalar(255, 255, 255), 8);
                    }
                }
                {
                    int num = (int)marker3.size();
                    if (num != 0)
                    {
                        const Point * pt4 = &(marker3[0]);
                        fillPoly(mask, &pt4, &num, 1, Scalar(255, 255, 255), 8);
                    }
                }
                // draw the mask
                imshow("black white lines", mask);
                // we dont have the correct size/aspect ratio
                double width = 256.0f, height = 256.0f;
                dstCorners[0] = Point2f(0.0f, 0.0f);
                dstCorners[1] = Point2f(width, 0.0f);
                dstCorners[2] = Point2f(width, height);
                dstCorners[3] = Point2f(0.0f, height);
                // get perspectivetransform
                Mat M = getPerspectiveTransform(srcCorners, dstCorners);
                // warp perspective
                Mat dst;
                Size dsize = Size(cvRound(dstCorners[2].x), cvRound(dstCorners[2].y));
                warpPerspective(mask, dst, M, dsize);
                // show warped image
                imshow("perspective transformed", dst);
                // get width and length of the first marker
                // start from (0,0) and cross 
                int cx = 0, cy = 0; // track our current coordinate
                Scalar v, vx, vy; // pixel value at coordinate
                bool cont = true;
                while (cont)
                {
                    v = dst.at<uchar>(cx, cy); // get pixel value at current coordinate
                    if (cx > 1 && cy > 1) 
                    {
                        vx = dst.at<uchar>(cx - 1, cy);
                        vy = dst.at<uchar>(cx, cy - 1);
                    }
                    // if pixel not black, continue crossing
                    if ((int)v.val[0] != 0)
                    {
                        cx++;
                        cy++;
                    }
                    // current pixel is black
                    // if previous y pixel is not black, means that we need to walk the pixel right
                    else if ((int)((Scalar)dst.at<uchar>(cx, cy - 1)).val[0] != 0)
                    {
                        cx = cx + 1;
                    }
                    // if previous x pixel is not black, means that we need to walk the pixel down
                    else if ((int)((Scalar)dst.at<uchar>(cx - 1, cy)).val[0] != 0)
                    {
                        cy = cy + 1;
                    }
                    // the rest is the same with previous 2, only with higher previous pixel to check
                    // need to do this because sometimes pixels is jagged
                    else if ((int)((Scalar)dst.at<uchar>(cx, cy - 2)).val[0] != 0)
                    {
                        cx = cx + 1;
                    }
                    else if ((int)((Scalar)dst.at<uchar>(cx - 2, cy)).val[0] != 0)
                    {
                        cy = cy + 1;
                    }
                    else if ((int)((Scalar)dst.at<uchar>(cx, cy - 3)).val[0] != 0)
                    {
                        cx = cx + 1;
                    }
                    else if ((int)((Scalar)dst.at<uchar>(cx - 3, cy)).val[0] != 0)
                    {
                        cy = cy + 1;
                    }
                    else if ((int)((Scalar)dst.at<uchar>(cx, cy - 4)).val[0] != 0)
                    {
                        cx = cx + 1;
                    }
                    else if ((int)((Scalar)dst.at<uchar>(cx - 4, cy)).val[0] != 0)
                    {
                        cy = cy + 1;
                    }
                    else if ((int)((Scalar)dst.at<uchar>(cx, cy - 5)).val[0] != 0)
                    {
                        cx = cx + 1;
                    }
                    else if ((int)((Scalar)dst.at<uchar>(cx - 5, cy)).val[0] != 0)
                    {
                        cy = cy + 1;
                    }
                    else
                    {
                        cx = cx - 1;
                        cy = cy - 1;
                        cont = false;
                    }
                    // reached the end of the picture
                    if (cx >= dst.cols)
                    {
                        cont = false;
                    }
                    else if (cy >= dst.rows)
                    {
                        cont = false;
                    }
                }
                if (cx == cy)
                {
                    //we have perfect square
                }
                if (cx > cy)
                {
                    // wide
                    width = (height * ((double)cx / (double)cy));
                }
                else
                {
                    // tall
                    height = (width * ((double)cy / (double)cx));
                }
                // we dont want the size varied too much every frame, 
                // so limits the increment or decrement for every frame
                // initialize first usage
                if (prevW<0)
                {
                    prevW = width;
                }
                if (prevH<0)
                {
                    prevH = height;
                }
                if (width > prevW + increment)
                {
                    width = prevW + increment;
                }
                else if (width < prevW - increment)
                {
                    width = prevW - increment;
                }
                prevW = width;
                if (height > prevH + increment)
                {
                    height = prevH + increment;
                }
                else if (height < prevH - increment)
                {
                    height = prevH - increment;
                }
                prevH = height;
                // show resized image
                Size s(width, height);
                Mat resized;
                resize(dst, resized, s);
                imshow("resized", resized);
            }
        }
        if (showRejected && rejected.size() > 0)
            aruco::drawDetectedMarkers(imageCopy, rejected, noArray(), Scalar(100, 0, 255));
        imshow("out", imageCopy);
        if (waitKey(1) == 27) {
            break;
        }
    }
    cout << "Hello World!\n";
    cin.ignore();
    return 0;
}
I'm more interested in a mathematical solution, but for now this suffices. If you know a better (faster) approach, let me know.