We are having problems using cuSOLVER's cusolverSpScsrlsvchol function, probably due to a misunderstanding of the cuSOLVER library.
Motivation: we are solving the Poisson equation -div(grad x) = b on a rectangular grid. In two dimensions with a 5-point stencil (1, 1, -4, 1, 1), the discrete Laplacian on the grid gives a (quite sparse) matrix A, and the charge distribution on the grid gives a (dense) vector b. A is positive definite and symmetric.
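To make the sign convention explicit (this is what the assembly code below implements): for an interior node (i, j), with the grid spacing absorbed into b, the row of A is the negated stencil,

4*x(i,j) - x(i-1,j) - x(i+1,j) - x(i,j-1) - x(i,j+1) = b(i,j),

while every boundary node simply gets the Dirichlet row x(i,j) = 1.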
Now we solve A * x = b for x using NVIDIA's new cuSOLVER library that ships with CUDA 7.0. It provides the function cusolverSpScsrlsvchol, which should perform a sparse Cholesky factorisation for floats.
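For reference, this is the prototype we are calling, as we read it from cusolverSp.h shipped with CUDA 7.0 (parameter names copied to the best of our understanding, please correct us if we misread the argument list):

cusolverStatus_t cusolverSpScsrlsvchol(cusolverSpHandle_t handle, int m, int nnz,
                                       const cusparseMatDescr_t descrA,
                                       const float *csrValA, const int *csrRowPtrA,
                                       const int *csrColIndA, const float *b,
                                       float tol, int reorder,
                                       float *x, int *singularity);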
Note: we are able to solve the system correctly with the alternative sparse QR factorisation function cusolverSpScsrlsvqr. For a 4 x 4 grid with all b entries on the edge set to 1 and the rest to 0, we get for x (a small host-side residual check is sketched right after this output):
1 1 0.999999 1 1 1 0.999999 1 1 1 1 1 1 1 1 1 
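To convince ourselves that this x is correct, we use a small host-side residual check on the COO data. This is only a sketch; max_residual is our own helper, not part of any library:

#include <vector>
#include <algorithm>
#include <cmath>
// our own diagnostic, not part of cuSOLVER/cuSPARSE:
// computes max_i |(A*x - b)_i| with a host-side COO matrix-vector product
float max_residual(const std::vector<float>& vals, const std::vector<int>& row,
                   const std::vector<int>& col, const std::vector<float>& x,
                   const std::vector<float>& rhs) {
    std::vector<float> Ax(rhs.size(), 0.f);
    for (size_t k = 0; k < vals.size(); ++k)
        Ax[row[k]] += vals[k] * x[col[k]];   // accumulate A*x entry by entry
    float r = 0.f;
    for (size_t i = 0; i < rhs.size(); ++i)
        r = std::max(r, std::fabs(Ax[i] - rhs[i]));
    return r;                                // max-norm of A*x - b
}

For a correct solve, max_residual(csrVal, cooRow, csrColInd, x, b) should be on the order of single-precision round-off.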
Our problems:
- cusolverSpScsrlsvchol returns wrong results for x (see the symmetry-check sketch after this list):
1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1
- (solved, see answer below) Converting the CSR matrix A to a dense matrix and printing it gives weird numbers (10^-44 and the like). The underlying CSR data are correct; we validated them against Python/NumPy.
- (solved, see answer below) The alternative sparse LU with partial pivoting, cusolverSpScsrlsvlu, cannot even be found:
$ nvcc -std=c++11 cusparse_test3.cu -o cusparse_test3 -lcusparse -lcusolver
cusparse_test3.cu(208): error: identifier "cusolverSpScsrlsvlu" is undefined
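Since csrlsvchol presumably requires A to be symmetric positive definite, a quick host-side sanity check on the assembled COO data might help rule that out. This is only a sketch; is_coo_symmetric is our own helper (not part of cuSPARSE/cuSOLVER):

#include <vector>
#include <map>
#include <utility>
#include <cmath>
// our own diagnostic: checks that for every COO entry (r, c, v) there is a
// matching entry (c, r, v), i.e. that A is numerically symmetric
bool is_coo_symmetric(const std::vector<float>& vals, const std::vector<int>& row,
                      const std::vector<int>& col, float eps = 1e-6f) {
    std::map<std::pair<int,int>, float> entries;
    for (size_t k = 0; k < vals.size(); ++k)
        entries[{row[k], col[k]}] = vals[k];
    for (const auto& e : entries) {
        auto t = entries.find({e.first.second, e.first.first});
        if (t == entries.end() || std::fabs(t->second - e.second) > eps)
            return false;   // transposed entry missing or different
    }
    return true;
}

It can be called on csrVal, cooRow and csrColInd right after assemble_poisson_matrix_coo in the code below.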
What are we doing wrong? Thanks for your help!
Our C++ CUDA code:
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <vector>
#include <cassert>
// create poisson matrix with Dirichlet bc. of a rectangular grid with
// dimension NxN
void assemble_poisson_matrix_coo(std::vector<float>& vals, std::vector<int>& row, std::vector<int>& col,
                     std::vector<float>& rhs, int Nrows, int Ncols) {
        //nnz: 5 entries per row (node) for nodes in the interior
    // 1 entry per row (node) for nodes on the boundary, since we set them explicitly to 1.
    int nnz = 5*Nrows*Ncols - (2*(Ncols-1) + 2*(Nrows-1))*4;
    vals.resize(nnz);
    row.resize(nnz);
    col.resize(nnz);
    rhs.resize(Nrows*Ncols);
    int counter = 0;
    for(int i = 0; i < Nrows; ++i) {
        for (int j = 0; j < Ncols; ++j) {
            int idx = j + Ncols*i;
            if (i == 0 || j == 0 || j == Ncols-1 || i == Nrows-1) {
                vals[counter] = 1.;
                row[counter] = idx;
                col[counter] = idx;
                counter++;
                rhs[idx] = 1.;
//                if (i == 0) {
//                    rhs[idx] = 3.;
//                }
            } else { // -laplace stencil
                // above
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx-Ncols;
                counter++;
                // left
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx-1;
                counter++;
                // center
                vals[counter] = 4.;
                row[counter] = idx;
                col[counter] = idx;
                counter++;
                // right
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx+1;
                counter++;
                // below
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx+Ncols;
                counter++;
                rhs[idx] = 0;
            }
        }
    }
    assert(counter == nnz);
}
int main() {
    // --- create library handles:
    cusolverSpHandle_t cusolver_handle;
    cusolverStatus_t cusolver_status;
    cusolver_status = cusolverSpCreate(&cusolver_handle);
    std::cout << "status create cusolver handle: " << cusolver_status << std::endl;
    cusparseHandle_t cusparse_handle;
    cusparseStatus_t cusparse_status;
    cusparse_status = cusparseCreate(&cusparse_handle);
    std::cout << "status create cusparse handle: " << cusparse_status << std::endl;
    // --- prepare matrix:
    int Nrows = 4;
    int Ncols = 4;
    std::vector<float> csrVal;
    std::vector<int> cooRow;
    std::vector<int> csrColInd;
    std::vector<float> b;
    assemble_poisson_matrix_coo(csrVal, cooRow, csrColInd, b, Nrows, Ncols);
    int nnz = csrVal.size();
    int m = Nrows * Ncols;
    std::vector<int> csrRowPtr(m+1);
    // --- prepare solving and copy to GPU:
    std::vector<float> x(m);
    float tol = 1e-5;     // tolerance used by the solver to decide singularity
    int reorder = 0;      // no reordering
    int singularity = 0;  // set to -1 by the solver if A is invertible under tol
    float *db, *dcsrVal, *dx;
    int *dcsrColInd, *dcsrRowPtr, *dcooRow;
    cudaMalloc((void**)&db, m*sizeof(float));
    cudaMalloc((void**)&dx, m*sizeof(float));
    cudaMalloc((void**)&dcsrVal, nnz*sizeof(float));
    cudaMalloc((void**)&dcsrColInd, nnz*sizeof(int));
    cudaMalloc((void**)&dcsrRowPtr, (m+1)*sizeof(int));
    cudaMalloc((void**)&dcooRow, nnz*sizeof(int));
    cudaMemcpy(db, b.data(), b.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dcsrVal, csrVal.data(), csrVal.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dcsrColInd, csrColInd.data(), csrColInd.size()*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dcooRow, cooRow.data(), cooRow.size()*sizeof(int), cudaMemcpyHostToDevice);
    cusparse_status = cusparseXcoo2csr(cusparse_handle, dcooRow, nnz, m,
                                       dcsrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
    std::cout << "status cusparse coo2csr conversion: " << cusparse_status << std::endl;
    cudaDeviceSynchronize(); // matrix format conversion has to be finished!
    // --- everything ready for computation:
    cusparseMatDescr_t descrA;
    cusparse_status = cusparseCreateMatDescr(&descrA);
    std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;
    // optional: print dense matrix that has been allocated on GPU
    // (note: cusparseScsr2dense fills dA in column-major order, so the
    // row-major print loop below effectively shows A transposed)
    std::vector<float> A(m*m, 0);
    float *dA;
    cudaMalloc((void**)&dA, A.size()*sizeof(float));
    cusparseScsr2dense(cusparse_handle, m, m, descrA, dcsrVal,
                       dcsrRowPtr, dcsrColInd, dA, m);
    cudaMemcpy(A.data(), dA, A.size()*sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "A: \n";
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < m; ++j) {
            std::cout << A[i*m + j] << " ";
        }
        std::cout << std::endl;
    }
    cudaFree(dA);
    std::cout << "b: \n";
    cudaMemcpy(b.data(), db, m*sizeof(float), cudaMemcpyDeviceToHost);
    for (auto a : b) {
        std::cout << a << ",";
    }
    std::cout << std::endl;
    // --- solving!!!!
//    cusolver_status = cusolverSpScsrlsvchol(cusolver_handle, m, nnz, descrA, dcsrVal,
//                       dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
//                       &singularity);
     cusolver_status = cusolverSpScsrlsvqr(cusolver_handle, m, nnz, descrA, dcsrVal,
                        dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
                        &singularity);
    cudaDeviceSynchronize();
    std::cout << "singularity (should be -1): " << singularity << std::endl;
    std::cout << "status cusolver solving (!): " << cusolver_status << std::endl;
    cudaMemcpy(x.data(), dx, m*sizeof(float), cudaMemcpyDeviceToHost);
    // relocated these 2 lines from above to solve (2):
    cusparse_status = cusparseDestroy(cusparse_handle);
    std::cout << "status destroy cusparse handle: " << cusparse_status << std::endl;
    cusolver_status = cusolverSpDestroy(cusolver_handle);
    std::cout << "status destroy cusolver handle: " << cusolver_status << std::endl;
    for (auto a : x) {
        std::cout << a << " ";
    }
    std::cout << std::endl;
    cudaFree(db);
    cudaFree(dx);
    cudaFree(dcsrVal);
    cudaFree(dcsrColInd);
    cudaFree(dcsrRowPtr);
    cudaFree(dcooRow);
    return 0;
}