Since none of these answers offered an actual benchmark, I thought I'd try to provide one.  However, think I've left myself more confused than when I started.  I tried to come up with a test that would measure passing a shared_ptr<int> by value, by reference, and using std::move, performing an add operation on that value, and returning the result.  I did this several times (one million) using two sets of tests.  The first set added a constant value to the shared_ptr<int>, the other added a random value in the [0, 10] range.  I figured the constant value addition would be a candidate for heavy optimization, whereas the random value test would not.  That is more-or-less what I saw, but the extreme differences in execution time leads me to believe that other factors/problems with this test program are the contributing factors to the execution time differences, not the move semantics.
tl;dr
For no optimizations (-O0), constant addition
- std::movewas ~4x faster than pass-by-value
- std::movewas marginally slower than pass-by-reference
For high optimizations (-O3), constant addition
- std::movewas 70-90 thousand times faster than pass-by-value
- std::movewas marginally faster than pass-by-reference (anywhere from 1-1.4 times)
For no optimizations (-O0), random addition
- std::movewas 1-2 times faster than pass-by-value
- std::movewas marginally slower than pass-by-reference
For high optimizations (-O3), random addition
- std::movewas 1-1.3 times faster than pass-by-value (marginally worse than no optimizations)
- std::movewas essentially the same as pass-by-reference
Finally, the test
#include <memory>
#include <iostream>
#include <chrono>
#include <ctime>
#include <random>
constexpr auto MAX_NUM_ITS = 1000000;
// using random values to try to cut down on massive compiler optimizations
static std::random_device RAND_DEV;
static std::mt19937 RNG(RAND_DEV());
static std::uniform_int_distribution<std::mt19937::result_type> DIST11(0,10);
void CopyPtr(std::shared_ptr<int> myInt)
{
    // demonstrates that use_count increases with each copy
    std::cout << "In CopyPtr: ref count = " << myInt.use_count() << std::endl;
    std::shared_ptr<int> myCopyInt(myInt);
    std::cout << "In CopyPtr: ref count = " << myCopyInt.use_count() << std::endl;
}
void ReferencePtr(std::shared_ptr<int>& myInt)
{
    // reference count stays the same until a copy is made
    std::cout << "In ReferencePtr: ref count = " << myInt.use_count() << std::endl;
    std::shared_ptr<int> myCopyInt(myInt);
    std::cout << "In ReferencePtr: ref count = " << myCopyInt.use_count() << std::endl;
}
void MovePtr(std::shared_ptr<int>&& myInt)
{
    // demonstrates that use_count remains constant with each move
    std::cout << "In MovePtr: ref count = " << myInt.use_count() << std::endl;
    std::shared_ptr<int> myMovedInt(std::move(myInt));
    std::cout << "In MovePtr: ref count = " << myMovedInt.use_count() << std::endl;
}
int CopyPtrFastConst(std::shared_ptr<int> myInt)
{
    return 5 + *myInt;
}
int ReferencePtrFastConst(std::shared_ptr<int>& myInt)
{
    return 5 + *myInt;
}
int MovePtrFastConst(std::shared_ptr<int>&& myInt)
{
    return 5 + *myInt;
}
int CopyPtrFastRand(std::shared_ptr<int> myInt)
{
    return DIST11(RNG) + *myInt;
}
int ReferencePtrFastRand(std::shared_ptr<int>& myInt)
{
    return DIST11(RNG) + *myInt;
}
int MovePtrFastRand(std::shared_ptr<int>&& myInt)
{
    return DIST11(RNG) + *myInt;
}
void RunConstantFunctions(std::shared_ptr<int> myInt)
{
    std::cout << "\nIn constant funcs, ref count = " << myInt.use_count() << std::endl;
    // demonstrates speed of each function
    int sum = 0;
    // Copy pointer
    auto start = std::chrono::steady_clock::now();
    for (auto i=0; i<MAX_NUM_ITS; i++)
    {
        sum += CopyPtrFastConst(myInt);
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> copyElapsed = end - start;
    std::cout << "CopyPtrConst sum = " << sum << ", took " << copyElapsed.count() << " seconds.\n";
    // pass pointer by reference
    sum = 0;
    start = std::chrono::steady_clock::now();
    for (auto i=0; i<MAX_NUM_ITS; i++)
    {
        sum += ReferencePtrFastConst(myInt);
    }
    end = std::chrono::steady_clock::now();
    std::chrono::duration<double> refElapsed = end - start;
    std::cout << "ReferencePtrConst sum = " << sum << ", took " << refElapsed.count() << " seconds.\n";
    // pass pointer using std::move
    sum = 0;
    start = std::chrono::steady_clock::now();
    for (auto i=0; i<MAX_NUM_ITS; i++)
    {
        sum += MovePtrFastConst(std::move(myInt));
    }
    end = std::chrono::steady_clock::now();
    std::chrono::duration<double> moveElapsed = end - start;
    std::cout << "MovePtrConst sum = " << sum << ", took " << moveElapsed.count() <<
        " seconds.\n";
    std::cout << "std::move vs pass by value: " << copyElapsed / moveElapsed << " times faster.\n";
    std::cout << "std::move vs pass by ref:   " << refElapsed / moveElapsed << " times faster.\n";
}
void RunRandomFunctions(std::shared_ptr<int> myInt)
{
    std::cout << "\nIn random funcs, ref count = " << myInt.use_count() << std::endl;
    // demonstrates speed of each function
    int sum = 0;
    // Copy pointer
    auto start = std::chrono::steady_clock::now();
    for (auto i=0; i<MAX_NUM_ITS; i++)
    {
        sum += CopyPtrFastRand(myInt);
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> copyElapsed = end - start;
    std::cout << "CopyPtrRand sum = " << sum << ", took " << copyElapsed.count() << " seconds.\n";
    // pass pointer by reference
    sum = 0;
    start = std::chrono::steady_clock::now();
    for (auto i=0; i<MAX_NUM_ITS; i++)
    {
        sum += ReferencePtrFastRand(myInt);
    }
    end = std::chrono::steady_clock::now();
    std::chrono::duration<double> refElapsed = end - start;
    std::cout << "ReferencePtrRand sum = " << sum << ", took " << refElapsed.count() << " seconds.\n";
    // pass pointer using std::move
    sum = 0;
    start = std::chrono::steady_clock::now();
    for (auto i=0; i<MAX_NUM_ITS; i++)
    {
        sum += MovePtrFastRand(std::move(myInt));
    }
    end = std::chrono::steady_clock::now();
    std::chrono::duration<double> moveElapsed = end - start;
    std::cout << "MovePtrRand sum = " << sum << ", took " << moveElapsed.count() <<
        " seconds.\n";
    std::cout << "std::move vs pass by value: " << copyElapsed / moveElapsed << " times faster.\n";
    std::cout << "std::move vs pass by ref:   " << refElapsed / moveElapsed << " times faster.\n";
}
int main()
{
    // demonstrates how use counts are effected between copy and move
    std::shared_ptr<int> myInt = std::make_shared<int>(5);
    std::cout << "In main: ref count = " << myInt.use_count() << std::endl;
    CopyPtr(myInt);
    std::cout << "In main: ref count = " << myInt.use_count() << std::endl;
    ReferencePtr(myInt);
    std::cout << "In main: ref count = " << myInt.use_count() << std::endl;
    MovePtr(std::move(myInt));
    std::cout << "In main: ref count = " << myInt.use_count() << std::endl;
    // since myInt was moved to MovePtr and fell out of scope on return (was destroyed),
    // we have to reinitialize myInt
    myInt.reset();
    myInt = std::make_shared<int>(5);
    RunConstantFunctions(myInt);
    RunRandomFunctions(myInt);
    return 0;
}
live version here
I noticed that for -O0 and -O3, the constant functions both compiled to the same assembly for both sets of flags, both relatively short blocks.  This makes me think a majority of the optimization comes from the calling code, but I'm not really seeing that in my amateur assembly knowledge.
The random functions compiled to quite a bit of assembly, even for -O3, so the random part must be dominating that routine.
So in the end, not really sure what to make of this.  Please throw darts at it, tell me what I did wrong, offer some explanations.