In MPI, sending a pointer to an object is not enough. Unlike threads or OpenMP, the default behavior is parallel: every process runs the whole program. If you write imwrite(name, grey2);, the image grey2 will be written size times. If you send the pointer grey from proc 0 to proc 1, the grey pointer on proc 1 will point to memory owned by proc 0, which proc 1 cannot access because each process has its own address space. This will probably cause failures.
MPI offers you many ways to communicate beyond MPI_Send() and MPI_Recv(). For instance, MPI_Bcast() is suitable for sending the image from proc 0 to all procs. http://www.mcs.anl.gov/research/projects/mpi/www/www3/MPI_Bcast.html
I changed your code to use MPI_Bcast() by sending the size of the image first and then the data.
#include<iostream>
#include<mpi.h>
#include<cmath>
#include<opencv2/imgproc/imgproc.hpp>
#include<opencv2/highgui/highgui.hpp>
using namespace std;
using namespace cv;
//int mod(int z, int l);
int xGradient(Mat image, int x, int y)
{
    return ((int)(image.at<uchar>(y-1, x-1))) +
            2*image.at<uchar>(y, x-1) +
            image.at<uchar>(y+1, x-1) -
            image.at<uchar>(y-1, x+1) -
            2*image.at<uchar>(y, x+1) -
            image.at<uchar>(y+1, x+1);
}
int yGradient(Mat image, int x, int y)
{
    return ((int)(image.at<uchar>(y-1, x-1))) +
            2*image.at<uchar>(y-1, x) +
            image.at<uchar>(y-1, x+1) -
            image.at<uchar>(y+1, x-1) -
            2*image.at<uchar>(y+1, x) -
            image.at<uchar>(y+1, x+1);
}
int main()
{
    Mat src, grey, dst;
    Mat grey2;
    double start, end;
    int gx, gy, sum, argc, awal,akhir, size, rank;
    int master=0;
    char **argv;
    // MPI_Status status;
    awal= MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    // start=MPI_Wtime();
    cout<<"rank "<<rank<<endl;
    size_t total;
    size_t elemsize;
    int sizes[3];
    if( rank == master )
    {
        start=MPI_Wtime();
        src= imread("jari1.jpg");
        cvtColor(src,grey,CV_BGR2GRAY);
        src.release();
        //dst = grey.clone();
        imwrite("jari2.jpg", grey );
        cout<<"ok here"<<endl;
        if(!grey.isContinuous()){
            cout<<"trouble : data is not continuous"<<endl;
        }
        total=grey.total();
        sizes[2]=grey.elemSize();
        cv::Size s = grey.size();
        sizes[0] = s.height;
        sizes[1] = s.width;
        cout<<"grey is made of "<<total<<" elements of size "<<sizes[2]<<" that is "<<sizes[0]<<" by "<<sizes[1]<<endl;
        if( !grey.data )
        {
            return -1;
        }
        // MPI_Send(&grey, 1, MPI_LONG, 1, 1, MPI_COMM_WORLD);
        cout<<"master mengirim data ke rank 1"<<endl;
        //fflush (stdout);
    }
    /*else if (rank==1)
{
 MPI_Recv(&grey, 1, MPI_LONG, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  cout<<"rank 1 menerima data"<<endl;
}*/
    MPI_Bcast( sizes, 3, MPI_INT, 0,   MPI_COMM_WORLD);
    cout<<rank<<" : "<<sizes[0]<<" "<<sizes[1]<<endl;
    if(rank!=master){
        grey.create(sizes[0],sizes[1],CV_8U);
        if(!grey.data){
            cout<<"data not allocated, rank "<<rank<<endl;
        }else{
            cout<<" ok !"<<endl;
        }
    }
    MPI_Bcast( grey.data, sizes[0]*sizes[1], MPI_CHAR, 0,   MPI_COMM_WORLD);
    //for output
    grey2.create(sizes[0],sizes[1],CV_8U);
    char name[100];
    sprintf(name,"jari%d.jpg",rank+42+size);
    imwrite(name, grey );
    /*
for(int y = 0; y < grey.rows; y++)
    for(int x = 0; x < grey.cols; x++)
        grey.at<uchar>(y,x) = 0;  
     */
    int starty=(rank*grey.rows/size);
    if(starty==0)
    {starty=1;}
    int stopy=((rank+1)*grey.rows/size);
    if(stopy>grey.rows - 1)
    {stopy=grey.rows - 1;}
    for(int y = starty; y < stopy; y++)
    {
        for(int x = 1; x < grey.cols - 1; x++)
        {
            gx = xGradient(grey, x, y);
            gy = yGradient(grey, x, y);
            sum = abs(gx) + abs(gy);
            //cout<<sum<<endl;
            sum = sum > 255 ? 255:sum;
            sum = sum < 0 ? 0 : sum;
            grey2.at<uchar>(y,x) = sum;
            //cout<<sum<<endl;
        }
    }
    grey.release();
    //namedWindow("deteksi tepi sobel");
    //imshow("deteksi tepi sobel", dst);
    //namedWindow("grayscale");
    //imshow("grayscale", grey);
    //namedWindow("Original");
    //imshow("Original", src);
    sprintf(name,"jari%d.jpg",rank+42);
    imwrite(name, grey2 );
    grey2.release();
    //MPI_Barrier(MPI_COMM_WORLD);
    end=MPI_Wtime();
    cout<<"time: "<< end-start << " detik " <<endl;
    akhir=MPI_Finalize();
    //waitKey();
    return 0;
}
To retrieve the data on proc 0, the MPI_Gatherv() function seems useful. http://www.mcs.anl.gov/research/projects/mpi/www/www3/MPI_Gatherv.html or http://mpi.deino.net/mpi_functions/MPI_Gatherv.html I'll let you go on with your code. You may need to take a closer look at tutorials and examples...
Edit :
I largely changed the code, and I hope this piece of code will end your quest...
I changed my mind and used MPI_Scatterv() to send a small part of the image to each proc. I also changed the computation of the gradient... Then I retrieve the image on one proc using MPI_Gatherv(). In the end, the overall speed-up is low, because most of the time is spent opening and writing files. Moreover, such filters (and this code in particular...) need a large memory bandwidth.
I fear that you did not fully understand how the first piece of code works. But this one is far from being clear... I had trouble with indexes...
#include<cmath>
#include<iostream>
#include<vector>
#include<mpi.h>
#include<opencv2/imgproc/imgproc.hpp>
#include<opencv2/highgui/highgui.hpp>
using namespace std;
using namespace cv;
//int mod(int z, int l);
static inline int xGradient(Mat image, int x, int y)
{
    return ((int)(image.at<uchar>(y-1, x-1))) +
            2*image.at<uchar>(y, x-1) +
            image.at<uchar>(y+1, x-1) -
            image.at<uchar>(y-1, x+1) -
            2*image.at<uchar>(y, x+1) -
            image.at<uchar>(y+1, x+1);
}
static inline int yGradient(Mat image, int x, int y)
{
    return ((int)(image.at<uchar>(y-1, x-1))) +
            2*image.at<uchar>(y-1, x) +
            image.at<uchar>(y-1, x+1) -
            image.at<uchar>(y+1, x-1) -
            2*image.at<uchar>(y+1, x) -
            image.at<uchar>(y+1, x+1);
}
/**
 * Horizontal Sobel response computed directly on a raw pixel buffer.
 *
 * Returns the weighted (1, 2, 1) sum of the column to the right of the pixel
 * minus the same weighted sum of the column to the left. Note that this sign
 * is the opposite of xGradient(); callers that only use the absolute value
 * are not affected.
 *
 * The pointer is const because the pixels are only read; a uchar* (OpenCV's
 * typedef for unsigned char) converts to it implicitly.
 *
 * @param pt    pointer to the centre pixel; the rows directly above and
 *              below and the columns directly left and right must be
 *              readable
 * @param cols  number of pixels per row of the buffer (row stride)
 * @return      signed gradient value
 */
static inline int xGradientd(const unsigned char* pt, int cols)
{
    const unsigned char* above = pt - cols;
    const unsigned char* below = pt + cols;
    const int right = above[1]  + 2*pt[1]  + below[1];
    const int left  = above[-1] + 2*pt[-1] + below[-1];
    return right - left;
}
/**
 * Vertical Sobel response computed directly on a raw pixel buffer.
 *
 * Returns the weighted (1, 2, 1) sum of the row below the pixel minus the
 * same weighted sum of the row above. Note that this sign is the opposite of
 * yGradient(); callers that only use the absolute value are not affected.
 *
 * The pointer is const because the pixels are only read; a uchar* (OpenCV's
 * typedef for unsigned char) converts to it implicitly.
 *
 * @param pt    pointer to the centre pixel; the rows directly above and
 *              below and the columns directly left and right must be
 *              readable
 * @param cols  number of pixels per row of the buffer (row stride)
 * @return      signed gradient value
 */
static inline int yGradientd(const unsigned char* pt, int cols)
{
    const unsigned char* above = pt - cols;
    const unsigned char* below = pt + cols;
    const int bottom = below[-1] + 2*below[0] + below[1];
    const int top    = above[-1] + 2*above[0] + above[1];
    return bottom - top;
}
int main()
{
    Mat src, grey, dst;
    Mat grey2;
    Mat grey3;
    double start, end;
    int gx, gy, sum, argc, awal,akhir, size, rank;
    char **argv;
    // MPI_Status status;
    awal= MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    // start=MPI_Wtime();
    cout<<"rank "<<rank<<endl;
    size_t total;
    size_t elemsize;
    int sizes[3];
    if( rank == 0)
    {
        start=MPI_Wtime();
        src= imread("jari1.jpg");
        cvtColor(src,grey,CV_BGR2GRAY);
        src.release();
        //dst = grey.clone();
        imwrite("jari2.jpg", grey );
        cout<<"ok here"<<endl;
        if(!grey.isContinuous()){
            cout<<"trouble : data is not continuous"<<endl;
        }
        total=grey.total();
        sizes[2]=grey.elemSize();
        cv::Size s = grey.size();
        sizes[0] = s.height;
        sizes[1] = s.width;
        cout<<"grey is made of "<<total<<" elements of size "<<sizes[2]<<" that is "<<sizes[0]<<" by "<<sizes[1]<<endl;
        if( !grey.data )
        {
            return -1;
        }
        // MPI_Send(&grey, 1, MPI_LONG, 1, 1, MPI_COMM_WORLD);
        cout<<"master mengirim data ke rank 1"<<endl;
        //fflush (stdout);
    }
    //start of parallel part. To this point, only proc 0 was working.
    if( rank == 0 )
    {
        start=MPI_Wtime();
    }
    //the sizes of the image grey are send to all processus.
    MPI_Bcast( sizes, 3, MPI_INT, 0,   MPI_COMM_WORLD);
    //cout<<rank<<" : "<<sizes[0]<<" "<<sizes[1]<<endl;
    int recvcount[size];
    int displ[size];
    int i;
    //compute size of local image
    //on each proc, a little slice of the image will be received from proc 0 through MPI_Scatterv
    //to compute the gradient, two extra lines should be send on top and bottom of slice.(except for 0 and sizes-1)
    //this is why there are so many tests.
    //how many pixels on the slice ? sendcount.
    int sendcount=sizes[1]*(sizes[0]/size)+2*sizes[1];
    if(rank==size-1){
        sendcount=sizes[1]*(sizes[0]-(size-1)*(sizes[0]/size))+sizes[1];
    }
    if(rank==0){
        sendcount-=sizes[1];
    }
    //printf("creating image %d %d \n",sendcount/sizes[1],sizes[1]);
    //image allocation : 
    grey3.create(sendcount/sizes[1],sizes[1],CV_8U);
    if(!grey3.data){
        cout<<"data not allocated, rank "<<rank<<endl;
    }else{
        //cout<<" ok !"<<endl;
    }
    //compute sizes and offsets on proc 0
    //how many char should be sent from proc 0 to proc i ? recvcount[i].
    //where does the data starts ? displ[i]. 
    //these information are needed by MPI_Scatterv() on proc 0
    if(rank==0){
        displ[0]=0;
        for(i=0;i<size;i++){
            recvcount[i]=grey.cols*(grey.rows/size)+grey.cols;
            if(i>0){
                recvcount[i]+=grey.cols;
            }
            if(i>0){
                displ[i]=recvcount[i-1]+displ[i-1]-2*grey.cols;
            }
        }
        recvcount[size-1]=grey.cols*(grey.rows-(size-1)*(grey.rows/size));
        if(size>1){
            recvcount[size-1]+=grey.cols;
        }
        if(size-1>0){
            displ[size-1]=grey.cols*(grey.rows)-recvcount[size-1];
        }
    }
    /*
if(rank==master){
for(i=0;i<size;i++){
printf("count %d displ %d \n",recvcount[i],displ[i]);
}
}
     */
    MPI_Scatterv( grey.data, recvcount, displ,  MPI_CHAR, grey3.data, sendcount,MPI_CHAR,0, MPI_COMM_WORLD);
    /*
char name[100];
sprintf(name,"jariscat%d.jpg",rank);
imwrite(name, grey3 );
     */
    //MPI_Bcast( grey.data, sizes[0]*sizes[1], MPI_CHAR, 0,   MPI_COMM_WORLD);
    //for output
    //this local slice will store the result of the gradient operation
    grey2.create(sendcount/sizes[1],sizes[1],CV_8U);
    /*
for(int y = 0; y < grey.rows; y++)
    for(int x = 0; x < grey.cols; x++)
        grey.at<uchar>(y,x) = 0;  
     */
    int starty=(rank*sizes[0]/size);
    if(starty==0)
    {starty=1;}
    int stopy=((rank+1)*sizes[0]/size);
    if(stopy>sizes[0] - 1)
    {stopy=sizes[0] - 1;}
    int ii=grey3.cols;
    uchar* data=grey3.data;
    uchar* datad=grey2.data;
    for(int y = starty; y < stopy; y++)
    {
        ii++;
        for(int x = 1; x < sizes[1] - 1; x++)
        {
            //gx = xGradient(grey, x, y);
            gx=xGradientd(&data[ii],grey2.cols);
            gy=yGradientd(&data[ii],grey2.cols);
            //gy = yGradient(grey, x, y);
            //printf("%d %d \n",gx,gy);
            sum = abs(gx) + abs(gy);
            //cout<<sum<<endl;
            sum = sum > 255 ? 255:sum;
            sum = sum < 0 ? 0 : sum;
            datad[ii] = sum;
            //cout<<sum<<endl;
            ii++;
        }
        ii++;
    }
    //namedWindow("deteksi tepi sobel");
    //imshow("deteksi tepi sobel", dst);
    //namedWindow("grayscale");
    //imshow("grayscale", grey);
    //namedWindow("Original");
    //imshow("Original", src);
    /*
sprintf(name,"jarigrad%d.jpg",rank);
imwrite(name, grey2 );
     */
    // now, the data in grey2 should be sent from every processor in image grey on proc 0
    //MPI_Gatherv will be used.
    //on proc 0, count of bytes to be received from each processor should be computed
    // as well as displacements representing where each part should be placed in image grey
    if(rank==0){
        displ[0]=0;
        for(i=0;i<size;i++){
            recvcount[i]=grey.cols*(grey.rows/size);
            if(i>0){
                displ[i]=recvcount[i-1]+displ[i-1];
            }
        }
        recvcount[size-1]=grey.cols*(grey.rows-(size-1)*(grey.rows/size));
        if(size-1>0){
            displ[size-1]=recvcount[size-2]+displ[size-2];
        }
    }
    //on each processor, how many lines should be sent ? sendcount.
    //where does the data in grey2 starts ? tosend.
    sendcount=sizes[1]*(sizes[0]/size);
    if(rank==size-1){
        sendcount=sizes[1]*(sizes[0]-(size-1)*(sizes[0]/size));
    }
    uchar* tosend=&grey2.data[grey2.cols];
    if(rank==0){
        tosend=&grey2.data[0];
    }
    MPI_Gatherv(tosend,sendcount , MPI_CHAR,grey.data, recvcount, displ,MPI_CHAR, 0, MPI_COMM_WORLD);
    grey2.release();
    //everything is back on proc 0 in image grey
    end=MPI_Wtime();
    if(rank==0){
        imwrite("output.jpg", grey );
        cout<<"time: "<< end-start << " detik " <<endl;
        grey.release();
    }
    akhir=MPI_Finalize();
    //waitKey();
    return 0;
}
Bye,
Francis