diff --git a/libvmaf/include/libvmaf/libvmaf_cuda.h b/libvmaf/include/libvmaf/libvmaf_cuda.h index ac34c24a3..c7fb5adab 100644 --- a/libvmaf/include/libvmaf/libvmaf_cuda.h +++ b/libvmaf/include/libvmaf/libvmaf_cuda.h @@ -102,6 +102,17 @@ int vmaf_cuda_preallocate_pictures(VmafContext *vmaf, */ int vmaf_cuda_fetch_preallocated_picture(VmafContext *vmaf, VmafPicture* pic); +/** + * Synchronizes all CUDA feature extractors within the VmafContext + * with the CPU using their flush function. All feature scores will + * be written when this function returns. + * + * @param vmaf VMAF context allocated with `vmaf_init()` and + * initialized with `vmaf_cuda_preallocate_pictures()`. + * @return 0 on success, or < 0 (a negative errno code) on error. + */ +int vmaf_cuda_fex_synchronize(VmafContext *vmaf); + #ifdef __cplusplus } #endif diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index d8b414436..52f23fc32 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -31,8 +31,10 @@ #include "cuda/integer_adm_cuda.h" #include "picture_cuda.h" #include - #include +#ifdef HAVE_NVTX +#include "nvtx3/nvToolsExt.h" +#endif #define RES_BUFFER_SIZE 4 * 3 * 2 @@ -55,7 +57,7 @@ typedef struct AdmStateCuda { int dst_stride, CUstream c_stream); CUstream str, host_stream; void* write_score_parameters; - CUevent ref_event, dis_event, finished; + CUevent ref_event, dis_event, finished, scores_written; VmafDictionary *feature_name_dict; // adm_dwt kernels @@ -642,7 +644,6 @@ typedef struct write_score_parameters_adm { static int write_scores(write_score_parameters_adm* params) { - VmafFeatureCollector *feature_collector = params->feature_collector; AdmStateCuda *s = params->s; unsigned index = params->index; @@ -715,7 +716,11 @@ static int write_scores(write_score_parameters_adm* params) s->feature_name_dict, "integer_adm_scale3", scores[6] / scores[7], index); - if (!s->debug) return err; + if (!s->debug) { + + return err; + } + err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_adm", score, index); @@ -749,7 +754,6 @@ static int write_scores(write_score_parameters_adm* params) err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_adm_den_scale3", scores[7], index); - return err; } @@ -1015,9 +1019,10 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_DISABLE_TIMING)); CUmodule adm_cm_module, adm_csf_den_module, adm_csf_module, adm_decouple_module, adm_dwt_module; @@ -1155,10 +1160,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores CHECK_CUDA(cuStreamSynchronize(s->str)); - // CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); CHECK_CUDA(cuCtxPopCurrent(NULL)); // current implementation is limited by the 16-bit data pipeline, thus @@ -1179,6 +1181,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, data->w = ref_pic->w[0]; CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT)); CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data)); + CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream)); return 0; } @@ -1215,15 +1218,32 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) if (s->write_score_parameters) free(s->write_score_parameters); ret |= vmaf_dictionary_free(&s->feature_name_dict); + + if(s->ref_event) CHECK_CUDA(cuEventDestroy(s->ref_event)); + if(s->dis_event) CHECK_CUDA(cuEventDestroy(s->dis_event)); + if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished)); + if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written)); + + //cuStreamDestroy(s->str); + return ret; } static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { +#ifdef HAVE_NVTX + nvtxRangePushA("flush adm_cuda"); +#endif AdmStateCuda *s = fex->priv; + int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); - return 1; + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + CHECK_CUDA(cuEventSynchronize(s->scores_written)); +#ifdef HAVE_NVTX + nvtxRangePop(); +#endif + return (ret < 0) ? ret : !ret; } static const char *provided_features[] = { diff --git a/libvmaf/src/feature/cuda/integer_motion_cuda.c b/libvmaf/src/feature/cuda/integer_motion_cuda.c index 9615cde98..3998bc42c 100644 --- a/libvmaf/src/feature/cuda/integer_motion_cuda.c +++ b/libvmaf/src/feature/cuda/integer_motion_cuda.c @@ -31,9 +31,12 @@ #include "picture.h" #include "picture_cuda.h" #include "cuda_helper.cuh" +#ifdef HAVE_NVTX +#include "nvtx3/nvToolsExt.h" +#endif typedef struct MotionStateCuda { - CUevent event, finished; + CUevent event, finished, scores_written; CUfunction funcbpc8, funcbpc16; CUstream str, host_stream; VmafCudaBuffer* blur[2]; @@ -44,6 +47,8 @@ typedef struct MotionStateCuda { double score; bool debug; bool motion_force_zero; + bool flushed; + bool closed; void (*calculate_motion_score)(const VmafPicture* src, VmafCudaBuffer* src_blurred, const VmafCudaBuffer* prev_blurred, VmafCudaBuffer* sad, unsigned width, unsigned height, @@ -136,12 +141,15 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt unsigned bpc, unsigned w, unsigned h) { MotionStateCuda *s = fex->priv; + s->flushed = true; + s->closed = false; CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC)); CUmodule module; CHECK_CUDA(cuModuleLoadData(&module, src_motion_score_ptx)); @@ -202,20 +210,39 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt return -ENOMEM; } +// if called twice in a row, finalize FEX and close static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { +#ifdef HAVE_NVTX + nvtxRangePushA("flush motion_cuda"); +#endif + MotionStateCuda *s = fex->priv; int ret = 0; - CHECK_CUDA(cuStreamSynchronize(s->str)); - CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - - if (s->index > 0) { - ret = vmaf_feature_collector_append(feature_collector, - "VMAF_integer_feature_motion2_score", - s->score, s->index); + if(!s->flushed) { + CHECK_CUDA(cuStreamSynchronize(s->str)); + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + while (cuEventQuery(s->scores_written) != CUDA_SUCCESS) + { + continue; + } + CHECK_CUDA(cuEventSynchronize(s->scores_written)); + + } + else { + if (s->index > 0 && !s->closed) { + ret = vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion2_score", + s->score, s->index); + } + s->closed = true; } + s->flushed = true; +#ifdef HAVE_NVTX + nvtxRangePop(); +#endif return (ret < 0) ? ret : !ret; } @@ -242,7 +269,7 @@ static int write_scores(write_score_parameters_moco* params) } if (err) return err; - if (params->index == 1) + if (params->index == 1) return 0; err = vmaf_feature_collector_append(feature_collector, @@ -257,13 +284,14 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, VmafFeatureCollector *feature_collector) { MotionStateCuda *s = fex->priv; - + if(s->closed) { + return -ESHUTDOWN; // TODO: proper error code here + } + s->flushed = false; // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores CHECK_CUDA(cuStreamSynchronize(s->str)); - // CHECK_CUDA(cuEventSynchronize(s->finished)); + CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); CHECK_CUDA(cuCtxPopCurrent(NULL)); int err = 0; @@ -286,10 +314,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic))); // This event ensures the input buffer is consumed CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->event)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); + if (index == 0) { err = vmaf_feature_collector_append(feature_collector, @@ -311,11 +336,13 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT)); write_score_parameters_moco* params = s->write_score_parameters; + cuEventSynchronize(s->scores_written); params->feature_collector = feature_collector; params->h = ref_pic->h[0]; params->w = ref_pic->w[0]; params->index = index; CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, s->write_score_parameters)); + CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream)); return 0; } @@ -342,6 +369,12 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) if(s->write_score_parameters) { free(s->write_score_parameters); } + + + if(s->event) CHECK_CUDA(cuEventDestroy(s->event)); + if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished)); + if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written)); + return ret; } diff --git a/libvmaf/src/feature/cuda/integer_vif_cuda.c b/libvmaf/src/feature/cuda/integer_vif_cuda.c index fcf68f442..c43b81706 100644 --- a/libvmaf/src/feature/cuda/integer_vif_cuda.c +++ b/libvmaf/src/feature/cuda/integer_vif_cuda.c @@ -33,6 +33,10 @@ #include "cuda/integer_vif_cuda.h" #include "picture_cuda.h" +#ifdef HAVE_NVTX +#include "nvtx3/nvToolsExt.h" +#endif + #if ARCH_X86 #include "x86/vif_avx2.h" #if HAVE_AVX512 @@ -42,7 +46,7 @@ typedef struct VifStateCuda { VifBufferCuda buf; - CUevent event, finished; + CUevent event, finished, scores_written; CUstream str, host_stream; bool debug; double vif_enhn_gain_limit; @@ -99,9 +103,9 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_DISABLE_TIMING)); // make this static CUmodule filter1d_module; CHECK_CUDA(cuModuleLoadData(&filter1d_module, src_filter1d_ptx)); @@ -453,8 +457,6 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, // before the GPU has finished writing to it. CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); CHECK_CUDA(cuCtxPopCurrent(NULL)); CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str)); @@ -478,7 +480,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuEventDestroy(s->event)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); } } @@ -496,7 +498,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, write_score_parameters_vif *data = s->buf.cpu_param_buf; data->feature_collector = feature_collector; data->index = index; - CHECK_CUDA(cuLaunchHostFunc(s->str, write_scores, data)); + CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data)); + CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream)); return 0; } @@ -517,6 +520,11 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) if (s->buf.accum_host) { ret |= vmaf_cuda_buffer_host_free(fex->cu_state, s->buf.accum_host); } + + if(s->event) CHECK_CUDA(cuEventDestroy(s->event)); + if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished)); + if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written)); + ret |= vmaf_dictionary_free(&s->feature_name_dict); return ret; } @@ -524,10 +532,20 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { +#ifdef HAVE_NVTX + nvtxRangePushA("flush vif_cuda"); +#endif VifStateCuda *s = fex->priv; + int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); - return 1; + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + CHECK_CUDA(cuEventSynchronize(s->scores_written)); +#ifdef HAVE_NVTX + nvtxRangePop(); +#endif + + return (ret < 0) ? ret : !ret; } static const char *provided_features[] = { diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c index 3fbb050c5..560735666 100644 --- a/libvmaf/src/libvmaf.c +++ b/libvmaf/src/libvmaf.c @@ -362,6 +362,20 @@ int vmaf_use_features_from_model_collection(VmafContext *vmaf, return err; } + +int vmaf_cuda_fex_synchronize(VmafContext *vmaf) { + if(!vmaf) return -EINVAL; + int err = 0; + RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors; + for (unsigned i = 0; i < rfe.cnt; i++) { + if ((rfe.fex_ctx[i]->fex->flags & VMAF_FEATURE_EXTRACTOR_CUDA)) + err |= vmaf_feature_extractor_context_flush(rfe.fex_ctx[i], + vmaf->feature_collector); + } + + return err; +} + struct ThreadData { VmafFeatureExtractorContext *fex_ctx; VmafPicture ref, dist; @@ -492,6 +506,7 @@ static int flush_context(VmafContext *vmaf) } #ifdef HAVE_CUDA + vmaf_cuda_fex_synchronize(vmaf); if (vmaf->cuda.state.ctx) { RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors; for (unsigned i = 0; i < rfe.cnt; i++) { @@ -747,6 +762,16 @@ int vmaf_score_at_index(VmafContext *vmaf, VmafModel *model, double *score, if (err) { err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index, score, true, 0); + // if(err) { + // // Error? Sync and try again + // vmaf_cuda_fex_synchronize(vmaf); + // err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index, + // score, true, 0); + // if(err == 0) { + // // No error - got score + // return 0; + // } + // } } return err; @@ -775,6 +800,8 @@ int vmaf_feature_score_pooled(VmafContext *vmaf, const char *feature_name, if (index_low > index_high) return -EINVAL; if (!pool_method) return -EINVAL; + // vmaf_cuda_fex_synchronize(vmaf); + unsigned pic_cnt = 0; double min = 0., max = 0., sum = 0., i_sum = 0.; for (unsigned i = index_low; i <= index_high; i++) {