Just to provide some empirical evidence for these theoretical arguments:
Here is a test case where several threads use xadd to increment a shared counter.  On an i7-8565U with 4 cores, it outputs
unlocked: counter = 1633267, expected 4000000
locked: counter = 4000000, expected 4000000
which clearly shows that xadd without the lock prefix is NOT atomic with respect to other cores: concurrent increments are lost.
The code (x86-64 only, GCC/Clang inline assembly; build with -pthread):
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <pthread.h>
/* Number of worker threads per run, and increments performed by each one. */
#define THREADS 4
#define COUNTS_PER_THREAD 1000000UL

/* Shared tally that every worker thread increments. */
unsigned long counter = 0UL;
void *unlocked_worker(void *unused) {
    (void)unused;
    for (unsigned long i = 0; i < COUNTS_PER_THREAD; i++) {
        unsigned long inc = 1;
        asm volatile("xaddq %0, %1" : "+r" (inc), "+m" (counter));
    }
    return NULL;
}
void *locked_worker(void *unused) {
    (void)unused;
    for (unsigned long i = 0; i < COUNTS_PER_THREAD; i++) {
        unsigned long inc = 1;
        asm volatile("lock; xaddq %0, %1" : "+r" (inc), "+m" (counter));
    }
    return NULL;
}
/*
 * Run one experiment: zero the shared counter, start THREADS threads that all
 * execute the same worker, wait for every one of them, then print the final
 * counter next to the value expected if no increment was lost.
 *
 * lock: non-zero selects locked_worker (reported as "locked"),
 *       zero selects unlocked_worker (reported as "unlocked").
 *
 * Any pthread_create/pthread_join failure is reported on stderr and ends the
 * process with exit status 1.
 */
void run_threads(int lock) {
    pthread_t workers[THREADS];
    void *(*entry)(void *);
    const char *label;
    int rc;

    if (lock) {
        entry = locked_worker;
        label = "locked";
    } else {
        entry = unlocked_worker;
        label = "unlocked";
    }

    counter = 0;

    /* Launch phase: every thread runs the same entry point with no argument. */
    for (int t = 0; t < THREADS; ++t) {
        rc = pthread_create(&workers[t], NULL, entry, NULL);
        if (rc) {
            fprintf(stderr, "pthread_create: %s\n", strerror(rc));
            exit(1);
        }
    }

    /* Wait phase: the counter is only read once all workers have finished. */
    for (int t = 0; t < THREADS; ++t) {
        rc = pthread_join(workers[t], NULL);
        if (rc) {
            fprintf(stderr, "pthread_join: %s\n", strerror(rc));
            exit(1);
        }
    }

    printf("%s: counter = %lu, expected %lu\n",
           label, counter, COUNTS_PER_THREAD * THREADS);
}
/* Run the experiment twice: first without the lock prefix (0), then with it (1). */
int main(void) {
    for (int lock = 0; lock <= 1; ++lock) {
        run_threads(lock);
    }
    return 0;
}