I am writing my own implementation of std::vector to learn C++, but my implementation is slower than std::vector (see the output below).
I would like advice from C++ experts on how to improve it. I saw the question "Why is std::vector so fast ( or is my implementation is too slow )", but it didn't help because that poster was using the wrong data structure.
Specifically, I am asking how I can make it faster than std::vector.
vector.h
/// Fixed-length array of T whose storage is held through an owning raw
/// pointer (`arr`) together with its element count (`len`).
///
/// The constructors, the non-const operator[], the non-const length(),
/// fill() and sum() are only declared here; their definitions live outside
/// this class body. Everything else is defined inline below.
template <typename T>
class Vector {
public:
    /// Builds a vector of n elements.
    explicit Vector(const int n);
    /// Builds a vector of n elements, each assigned from val.
    explicit Vector(const int n, const T& val);

    /// Frees the element array. Without this the storage held by `arr`
    /// would never be released.
    ~Vector() { delete[] arr; }

    // `arr` is an owning pointer, so a member-wise copy would leave two
    // objects calling delete[] on the same array. Copying is disabled rather
    // than silently aliasing; ownership can still be handed over by moving.
    Vector(const Vector&) = delete;
    Vector& operator=(const Vector&) = delete;

    /// Takes over other's array and leaves other empty (null pointer, length 0).
    Vector(Vector&& other) noexcept : arr(other.arr), len(other.len)
    {
        other.arr = nullptr;
        other.len = 0;
    }

    /// Releases the current array, then takes over other's array.
    Vector& operator=(Vector&& other) noexcept
    {
        if (this != &other) {  // self-move must not free the array we keep
            delete[] arr;
            arr = other.arr;
            len = other.len;
            other.arr = nullptr;
            other.len = 0;
        }
        return *this;
    }

    /// Unchecked element access; i is not validated against the length.
    T& operator[](const int i);
    /// Read-only unchecked element access for const vectors.
    const T& operator[](const int i) const { return arr[i]; }

    /// Number of elements.
    // NOTE(review): declared `inline` although this class body does not
    // define it; kept unchanged so the existing out-of-line definition still
    // matches.
    inline int const length();
    /// Number of elements, callable on const vectors.
    int length() const { return len; }

    /// Assigns val to every element.
    inline void fill(const T& val);

    /// Returns the total of all elements, by value. It is declared here so
    /// that calls such as `vec.sum()` resolve; the out-of-line definition
    /// must use this exact signature.
    T sum() const;
private:
    T* arr;   // owning pointer to the element array (released in ~Vector)
    int len;  // number of elements reachable through arr
};
vector.cpp
#include "vector.h"
#include <iostream>
#include <algorithm>
using namespace std;
// Overwrites every one of the len elements with a copy of val.
template <typename T>
inline void Vector<T>::fill(const T& val)
{
    // Qualified on purpose: an unqualified `fill` here would name this
    // member function instead of the standard algorithm.
    std::fill(arr, arr + len, val);
}
// Adds up all len elements in index order and returns the total.
//
// The result is returned by value: `total` is a local variable, so handing
// out a reference to it would leave the caller with a dangling reference.
// The function does not modify the vector, hence the const qualifier.
template <typename T>
T Vector<T>::sum() const
{
    T total = T();  // value-initialized accumulator (zero for arithmetic T)
    for (int i = 0; i < len; ++i) {
        total += arr[i];
    }
    return total;
}
// Allocates n elements. The trailing `()` in the new-expression
// value-initializes them, so arithmetic element types start at zero.
template <typename T>
Vector<T>::Vector(const int n)
    : arr(new T[n]()),
      len(n)
{
}
// Allocates n default-initialized elements, then assigns val to each of them.
template <typename T>
Vector<T>::Vector(const int n, const T& val)
    : arr(new T[n]),
      len(n)
{
    // Same element-by-element assignment as before, via the member helper.
    fill(val);
}
// Returns a reference to the element at position i.
// No bounds check is performed on i.
template <typename T>
T& Vector<T>::operator[](const int i)
{
    return *(arr + i);
}
// Reports the number of elements stored in the vector.
template <typename T>
int const Vector<T>::length()
{
    return this->len;
}
// Explicit instantiation definitions: generate all of Vector's members for
// int and float in this translation unit.
// NOTE(review): presumably required because the member definitions sit in
// this .cpp rather than in the header, so other files can only link against
// the element types listed here — confirm before using other types.
template class Vector<int>;
template class Vector<float>;
vector_test.cpp
#include "vector.h"
#include <iostream>
#include <chrono>
#include <vector>
using namespace std;
int main() 
{
    const int n = 2000000;
    float sum = 0;
    chrono::steady_clock::time_point start = chrono::steady_clock::now();   
    Vector<float> vec(n, 1);
    sum = vec.sum();
    chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    cout << "my vec sum = " << sum << '\n';
    cout << "my vec impl took " << chrono::duration_cast<chrono::microseconds>(end - start).count()
              << "us.\n";
    sum = 0;
    start = chrono::steady_clock::now();
    vector<float> vec2(n, 1);
    for (int i = 0; i < n; ++i) {
        sum += vec2[i];
    }
    end = std::chrono::steady_clock::now();
    cout << "std::vec sum = " << sum << '\n';
    cout << "stl::vec impl took " << chrono::duration_cast<chrono::microseconds>(end - start).count()
              << "us.\n";
}
Output:
my vec sum = 2e+06
my vec impl took 11040us.
std::vec sum = 2e+06
stl::vec impl took 8034us.
 
     
    