As the code shows, times increases by the increase of threads, and decrease by the decrease of threads, optimally at 1.
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
public class MatrixMultiplication {
    public static class Matrix {
        int[][] container;
        int rows;
        int cols;
        public Matrix(int rows, int cols) {
            this.rows = rows;
            this.cols = cols;
            container = new int[rows][cols];
        }
    }
    public static class MultiplyThread implements Runnable {
        private int row;
        private int col;
        Matrix a;
        Matrix b;
        Matrix c;
        public MultiplyThread(Matrix a, Matrix b, Matrix c, int row, int col) {
            this.row = row;
            this.col = col;
            this.a = a;
            this.b = b;
            this.c = c;
        }
        @Override
        public void run() {
            int sum = 0;
            for (int i = 0; i < a.cols; i++) // or b.rows
                sum += a.container[row][i] * b.container[i][col];
            c.container[row][col] = sum;
        }
    }
    public static Matrix multiplyMatrices(Matrix a, Matrix b, int threads) throws InterruptedException {
        if (a.cols != b.rows)
            return null;
        int rows = a.rows;
        int cols = b.cols;
        Matrix ret = new Matrix(rows, cols);
        ExecutorService executor = Executors.newFixedThreadPool(threads);
        int i = 0, j = 0;
        for (i = 0; i < rows; i++)
            for (j = 0; j < cols; j++)
                executor.submit(new MultiplyThread(a, b, ret, i, j));
        executor.shutdown();
        executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        return ret;
    }
    public static void main(String[] args) throws InterruptedException {
        int[][] mat = new int[][] { { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 } };
        Matrix a = new Matrix(4, 4);
        a.container = mat;
        Matrix b = new Matrix(4, 4);
        b.container = mat;
        int numProcessors = Runtime.getRuntime().availableProcessors();
        System.out.println(numProcessors);
        for (int i = 10; i >= 1; i--) {
            long time = System.currentTimeMillis();
            for (int j = 0; j < 10000; j++)
                multiplyMatrices(a, b, i);
            time = System.currentTimeMillis() - time;
            System.out.println("number of threads: " + i + ", and the time: " + time);
        }
    }
}
As the code shows, times increases by the increase of threads, and decrease by the decrease of threads, optimally at 1.
 
    