Sample output:
2017-12-20T16:24:47,604144+01:00 Hello, World!
Code (with put_printf usage demonstrated in put_timestamp):
#include <assert.h>
#include <chrono>
#include <iomanip>
#include <iostream>
class put_printf {
    static constexpr size_t failed = std::numeric_limits<size_t>::max(); // for any explicit error handling
    size_t stream_size; // excluding '\0'; on error set to 0 or to "failed"
    char buf_stack[2048+1]; // MAY be any size that fits on the stack (even 0), SHOULD be (just) large enough for most uses (including '\0')
    std::unique_ptr<char[]> buf_heap; // only used if the output doesn't fit in buf_stack
public:
    explicit put_printf(const char *format, ...)
            #if __GNUC__
            __attribute__ ((format (printf, 2, 3))) // most compelling reason for not using a variadic template; parameter 1 is implied "this"
            #endif
            {
        va_list args;
        va_start(args, format);
        const int res = vsnprintf(buf_stack, sizeof(buf_stack), format, args);
        va_end(args);
        if (res < 0) { // easily provoked, e.g., with "%02147483646i\n", i.e., more than INT_MAX-1 significant characters (only observed, no guarantee seen)
            stream_size = failed;
        } else if (res < sizeof(buf_stack)) { // preferred path
            stream_size = res;
        } else { // not artificially constrained
            try {
                const size_t buf_size = static_cast<size_t>(res) + 1; // avoids relying on "res < INT_MAX" (only observed, no guarantee seen)
                buf_heap.reset(new char[buf_size]); // observed to work even beyond INT_MAX=2^32-1 bytes
                va_start(args, format);
                if (vsnprintf(buf_heap.get(), buf_size, format, args) == res) stream_size = res;
                else stream_size = failed; // can't happen
                va_end(args);
            } catch (const std::bad_alloc&) { // insufficient free heap space (or an environment-specific constraint?)
                stream_size = failed;
            }
        }
    }
    friend std::ostream& operator<<(std::ostream& os, const put_printf& self) {
        if (self.stream_size == failed) {
            // (placeholder for any explicit error handling)
            return os;
        } else {
            // using write() rather than operator<<() to avoid a separate scan for '\0' or unintentional truncation at any internal '\0' character
            return os.write((self.buf_heap ? self.buf_heap.get() : self.buf_stack), self.stream_size);
        }
    }
};
class put_timestamp {
    const bool basic = false;
    const bool local = true;
public:
    friend std::ostream& operator<<(std::ostream& os, const put_timestamp& self) {
        const auto now = std::chrono::system_clock::now();
        const std::time_t now_time_t = std::chrono::system_clock::to_time_t(now);
        struct tm tm; if ((self.local ? localtime_r(&now_time_t, &tm) : gmtime_r(&now_time_t, &tm)) == nullptr) return os; // TODO: explicit error handling?
        static_assert(4 <= sizeof(int), "");
        const int microseconds = std::chrono::duration_cast<std::chrono::microseconds>(now.time_since_epoch() % std::chrono::seconds(1)).count();
        assert(0 <= microseconds && microseconds < 1000000); // TODO: (how) do we know?
        // TODO: doesn't "point" in "decimal_point()" imply "dot"/"full stop"/"period", unlike an obviously neutral term like "mark"/"separator"/"sign"?
        const char decimal_sign = std::use_facet<std::numpunct<char>>(os.getloc()).decimal_point() == '.' ? '.' : ','; // full stop accepted, comma preferred
        // TODO: all well and good for a locale-specific decimal sign, but couldn't the locale also upset microseconds formatting by grouping digits?
        os << std::put_time(&tm, self.basic ? "%Y%m%dT%H%M%S" : "%FT%T") << put_printf("%c%06i", decimal_sign, microseconds);
        if (! self.local) return os << "Z";
        const int tz_minutes = std::abs(static_cast<int>(tm.tm_gmtoff)) / 60;
        return os << put_printf(self.basic ? "%c%02i%02i" : "%c%02i:%02i", 0 <= tm.tm_gmtoff ? '+' : '-', tz_minutes / 60, tz_minutes % 60);
    }
};
int main() {
    // testing decimal sign
    ///std::cout.imbue(std::locale("en_GB"));
    ///std::cout.imbue(std::locale("fr_FR"));
    std::cout << put_timestamp() << " Hello, World!\n";
    #if 0
    typedef put_printf pf; // just to demo local abbreviation
    std::cout << "1: " << pf("%02147483646i\n"  , 1     ) << std::endl; // res < 0
    std::cout << "2: " << pf("%02147483643i%i\n", 1, 100) << std::endl; // res < 0
    std::cout << "3: " << pf("%02147483643i%i\n", 1,  10) << std::endl; // works
    std::cout << "4: " << pf("%02147483646i"    , 1     ) << std::endl; // works
    #endif
    return 0;
}
Comments about put_printf:
// Reasons for the name "put_printf" (and not "putf" after all):
// - put_printf is self-documenting, while using the naming pattern also seen in std::put_time;
// - it is not clear whether the proposed std::putf would support exactly the same format syntax;
// - it has a niche purpose, so a longer name is not an objection, and for frequent local uses
//     it is easy enough to declare an even shorter "typedef put_printf pf;" or so.
// Evaluation of delegating to vsnprintf() with intermediate buffer:
// (+) identical result without implementation and/or maintenance issues,
// (?) succeeds or fails as a whole, no output of successful prefix before point of failure
// (-) (total output size limited to INT_MAX-1)
// (-) overhead (TODO: optimal buf_stack size considering cache and VM page locality?)
// Error handling (an STL design problem?):
// - std::cout.setstate(std::ios_base::failbit) discards further std::cout output (stdout still works),
//     so, to be aware of an error in business logic yet keep on trucking in diagnostics,
//     should there be separate classes, or a possibility to plug in an error handler, or what?
// - should the basic or default error handling print a diagnostic message? throw an exception?
// TODO: could a function "int ostream_printf(std::ostream& os, const char *format, ...)"
//           first try to write directly into os.rdbuf() before using buf_stack and buf_heap,
//           and would that significantly improve performance or not?