Multilayer Decoding Support (and minimum FFmpeg version bump) #446

Open: wants to merge 2 commits into master
configure.ac (4 changes: 2 additions & 2 deletions)
@@ -6,7 +6,7 @@ AM_INIT_AUTOMAKE([1.11 subdir-objects])
AM_SILENT_RULES([yes])
AM_MAINTAINER_MODE([disable])

VERSION_INFO="5:0:0"
VERSION_INFO="5:1:0"

AC_MSG_CHECKING([if debug build is enabled])

@@ -88,7 +88,7 @@ PKG_PROG_PKG_CONFIG([0.22])
pkgconfigdir="\$(libdir)/pkgconfig"
AC_SUBST(pkgconfigdir)

PKG_CHECK_MODULES(FFMPEG, [libavformat >= 60.16.0 libavcodec >= 60.31.0 libswscale >= 7.5.0 libavutil >= 58.29.0 libswresample >= 4.12.0])
PKG_CHECK_MODULES(FFMPEG, [libavformat >= 61.7.0 libavcodec >= 61.19.0 libswscale >= 8.3.0 libavutil >= 59.39.0 libswresample >= 5.3.0])

dnl As of 0eec06ed8747923faa6a98e474f224d922dc487d ffmpeg only adds -lrt to lavc's
dnl LIBS, but lavu needs it, so move it to the end if it's present
doc/ffms2-changelog.md (4 changes: 4 additions & 0 deletions)
@@ -1,4 +1,8 @@
# FFmpegSource2 Changelog
- 5.1
- FFmpeg 7.1 is now the minimum requirement.
- Added layered decoding support, e.g. for spatial MV-HEVC.

- 5.0
- Fixed all issues with FFmpeg 6.1 which is now the minimum requirement
- Fixed av1 decoding
include/ffms.h (14 changes: 13 additions & 1 deletion)
@@ -22,7 +22,7 @@
#define FFMS_H

// Version format: major - minor - micro - bump
#define FFMS_VERSION ((5 << 24) | (0 << 16) | (0 << 8) | 0)
#define FFMS_VERSION ((5 << 24) | (1 << 16) | (0 << 8) | 0)

#include <stdint.h>
#include <stddef.h>
@@ -344,6 +344,18 @@ typedef struct FFMS_Frame {
/* Introduced in FFMS_VERSION ((3 << 24) | (1 << 16) | (1 << 8) | 0) */
uint8_t *HDR10Plus;
int HDR10PlusSize;

/*
* If these buffers are not NULL, left and right eye data is present.
* In such a case, the main buffer points to the primary eye's buffer,
* for use in monoscopic encoding.
*
* Introduced in FFMS_VERSION ((5 << 24) | (1 << 16) | (0 << 8) | 0)
*/
const uint8_t *LeftEyeData[4];
int LeftEyeLinesize[4];
const uint8_t *RightEyeData[4];
int RightEyeLinesize[4];
} FFMS_Frame;

typedef struct FFMS_TrackTimeBase {
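For illustration, here is a minimal consumer-side sketch (not part of this diff) of how client code could check the new per-eye fields. The helper name is hypothetical; it assumes a valid FFMS_VideoSource *vs and frame number n obtained through the usual indexing calls.

#include <ffms.h>
#include <cstdio>

void PrintEyeInfo(FFMS_VideoSource *vs, int n) {
    char ErrMsg[1024];
    FFMS_ErrorInfo E = { FFMS_ERROR_SUCCESS, FFMS_ERROR_SUCCESS, (int)sizeof(ErrMsg), ErrMsg };
    const FFMS_Frame *F = FFMS_GetFrame(vs, n, &E);
    if (!F) {
        std::fprintf(stderr, "GetFrame failed: %s\n", ErrMsg);
        return;
    }
    // Data[] keeps pointing at the primary eye, so existing callers are unaffected;
    // the per-eye pointers are only populated for layered (e.g. MV-HEVC) sources.
    if (F->LeftEyeData[0] && F->RightEyeData[0])
        std::printf("frame %d is stereoscopic (left stride %d, right stride %d)\n",
                    n, F->LeftEyeLinesize[0], F->RightEyeLinesize[0]);
    else
        std::printf("frame %d is monoscopic\n", n);
}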
src/core/videosource.cpp (96 changes: 95 additions & 1 deletion)
@@ -97,6 +97,32 @@ FFMS_Frame *FFMS_VideoSource::OutputFrame(AVFrame *Frame) {
}
}

if (IsLayered) {
if ((PrimaryEyeIsLeft && !EyesInverted) || (!PrimaryEyeIsLeft && EyesInverted)) {
for (int i = 0; i < 4; i++) {
LocalFrame.Data[i] = LeftEyeFrameData[i];
LocalFrame.Linesize[i] = LeftEyeLinesize[i];
}
} else {
for (int i = 0; i < 4; i++) {
LocalFrame.Data[i] = RightEyeFrameData[i];
LocalFrame.Linesize[i] = RightEyeLinesize[i];
}
}
for (int i = 0; i < 4; i++) {
LocalFrame.LeftEyeData[i] = LeftEyeFrameData[i];
LocalFrame.LeftEyeLinesize[i] = LeftEyeLinesize[i];
LocalFrame.RightEyeData[i] = RightEyeFrameData[i];
LocalFrame.RightEyeLinesize[i] = RightEyeLinesize[i];
}
if (EyesInverted) {
for (int i = 0; i < 4; i++) {
std::swap(LocalFrame.RightEyeData[i], LocalFrame.LeftEyeData[i]);
std::swap(LocalFrame.RightEyeLinesize[i], LocalFrame.LeftEyeLinesize[i]);
}
}
}

LocalFrame.EncodedWidth = Frame->width;
LocalFrame.EncodedHeight = Frame->height;
LocalFrame.EncodedPixelFormat = Frame->format;
@@ -258,10 +284,31 @@ FFMS_VideoSource::FFMS_VideoSource(const char *SourceFile, FFMS_Index &Index, in
if (CodecContext->codec_id == AV_CODEC_ID_H264 && CodecContext->has_b_frames)
CodecContext->has_b_frames = 15; // the maximum possible value for h264

if (avcodec_open2(CodecContext, Codec, nullptr) < 0)
// Are we layered?
if (FormatContext->streams[VideoTrack]->disposition & AV_DISPOSITION_MULTILAYER) {
IsLayered = true;
// See if we can figure out the primary (base) eye based on side data
for (int i = 0; i < FormatContext->streams[VideoTrack]->codecpar->nb_coded_side_data; i++) {
if (FormatContext->streams[VideoTrack]->codecpar->coded_side_data[i].type == AV_PKT_DATA_STEREO3D) {
const AVStereo3D *StereoSideData = (const AVStereo3D *)FormatContext->streams[VideoTrack]->codecpar->coded_side_data[i].data;
// If 'right', set it as such, otherwise it is left.
PrimaryEyeIsLeft = !(StereoSideData->primary_eye == AV_PRIMARY_EYE_RIGHT);
EyesInverted = !!(StereoSideData->flags & AV_STEREO3D_FLAG_INVERT);
}
}
}

// Just ask for all views possible if we're layered
AVDictionary *CodecDict = nullptr;
if (IsLayered)
av_dict_set(&CodecDict, "view_ids", "-1", 0);

if (avcodec_open2(CodecContext, Codec, &CodecDict) < 0)
throw FFMS_Exception(FFMS_ERROR_DECODING, FFMS_ERROR_CODEC,
"Could not open video codec");

av_dict_free(&CodecDict);

// Similar yet different to h264 workaround above
// vc1 simply sets has_b_frames to 1 no matter how many there are so instead we set it to the max value
// in order to not confuse our own delay guesses later
@@ -674,6 +721,24 @@ bool FFMS_VideoSource::HasPendingDelayedFrames() {
return false;
}

void FFMS_VideoSource::CopyEye(AVStereo3DView view) {
if (view == AV_STEREO3D_VIEW_LEFT) {
av_freep(&LeftEyeFrameData[0]);
if (av_image_alloc(LeftEyeFrameData, LeftEyeLinesize, DecodeFrame->width, DecodeFrame->height, (enum AVPixelFormat) DecodeFrame->format, 16) < 0)
throw FFMS_Exception(FFMS_ERROR_DECODING, FFMS_ERROR_ALLOCATION_FAILED,
"Could not allocate left eye buffer");
av_image_copy(LeftEyeFrameData, LeftEyeLinesize, DecodeFrame->data, DecodeFrame->linesize, (enum AVPixelFormat) DecodeFrame->format, DecodeFrame->width, DecodeFrame->height);
} else if (view == AV_STEREO3D_VIEW_RIGHT) {
av_freep(&RightEyeFrameData[0]);
if (av_image_alloc(RightEyeFrameData, RightEyeLinesize, DecodeFrame->width, DecodeFrame->height, (enum AVPixelFormat) DecodeFrame->format, 16) < 0)
throw FFMS_Exception(FFMS_ERROR_DECODING, FFMS_ERROR_ALLOCATION_FAILED,
"Could not allocate right eye buffer");
av_image_copy(RightEyeFrameData, RightEyeLinesize, DecodeFrame->data, DecodeFrame->linesize, (enum AVPixelFormat) DecodeFrame->format, DecodeFrame->width, DecodeFrame->height);
} else {
throw FFMS_Exception(FFMS_ERROR_DECODING, FFMS_ERROR_CODEC, "Layered decode with invalid view.");
}
}

bool FFMS_VideoSource::DecodePacket(AVPacket *Packet) {
std::swap(DecodeFrame, LastDecodedFrame);
ResendPacket = false;
@@ -692,6 +757,33 @@

Ret = avcodec_receive_frame(CodecContext, DecodeFrame);
if (Ret == 0) {
if (IsLayered) {
const AVFrameSideData *sd = av_frame_get_side_data(DecodeFrame, AV_FRAME_DATA_STEREO3D);
if (!sd)
throw FFMS_Exception(FFMS_ERROR_DECODING, FFMS_ERROR_CODEC,
"Missing Stereo3D for layered decode.");

const AVStereo3D *stereo3d = (const AVStereo3D *)sd->data;
AVStereo3DView first_view = stereo3d->view;
CopyEye(stereo3d->view);

Ret = avcodec_receive_frame(CodecContext, DecodeFrame);
if (Ret != 0)
throw FFMS_Exception(FFMS_ERROR_DECODING, FFMS_ERROR_CODEC,
"Missing second view for layered decode.");

sd = av_frame_get_side_data(DecodeFrame, AV_FRAME_DATA_STEREO3D);
if (!sd)
throw FFMS_Exception(FFMS_ERROR_DECODING, FFMS_ERROR_CODEC,
"Missing Stereo3D for layered decode second layer.");

stereo3d = (const AVStereo3D *)sd->data;
if ((first_view == AV_STEREO3D_VIEW_LEFT && stereo3d->view != AV_STEREO3D_VIEW_RIGHT) ||
(first_view == AV_STEREO3D_VIEW_RIGHT && stereo3d->view != AV_STEREO3D_VIEW_LEFT)) {
throw FFMS_Exception(FFMS_ERROR_DECODING, FFMS_ERROR_CODEC, "Unmatched left/right views in layered decode.");
}
CopyEye(stereo3d->view);
}
Delay.Decrement();
} else {
std::swap(DecodeFrame, LastDecodedFrame);
@@ -740,6 +832,8 @@ void FFMS_VideoSource::Free() {
if (SWS)
sws_freeContext(SWS);
av_freep(&SWSFrameData[0]);
av_freep(&LeftEyeFrameData[0]);
av_freep(&RightEyeFrameData[0]);
av_frame_free(&DecodeFrame);
av_frame_free(&LastDecodedFrame);
av_packet_free(&StashedPacket);
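For context, a standalone sketch (illustrative only, not part of this diff) of the FFmpeg 7.1 behaviour the decode path above relies on: with the hevc decoder's "view_ids" option set to "-1", each coded picture of an MV-HEVC stream yields one AVFrame per view, each tagged with AV_FRAME_DATA_STEREO3D side data. The helper name and surrounding setup are assumptions.

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/stereo3d.h>
}

// Feed one packet and classify every view the decoder returns for it.
static int decode_views(AVCodecContext *ctx, AVPacket *pkt, AVFrame *frame) {
    int ret = avcodec_send_packet(ctx, pkt);
    if (ret < 0)
        return ret;
    while ((ret = avcodec_receive_frame(ctx, frame)) == 0) {
        const AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_STEREO3D);
        const AVStereo3D *s3d = sd ? (const AVStereo3D *)sd->data : nullptr;
        if (s3d && s3d->view == AV_STEREO3D_VIEW_LEFT) {
            // left-eye picture: copy it out here (cf. CopyEye above)
        } else if (s3d && s3d->view == AV_STEREO3D_VIEW_RIGHT) {
            // right-eye picture
        }
        av_frame_unref(frame);
    }
    return ret == AVERROR(EAGAIN) ? 0 : ret;
}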
src/core/videosource.h (8 changes: 8 additions & 0 deletions)
@@ -92,6 +92,12 @@ struct FFMS_VideoSource {

uint8_t *SWSFrameData[4] = {};
int SWSFrameLinesize[4] = {};
bool EyesInverted = false;
bool PrimaryEyeIsLeft = true;
uint8_t *LeftEyeFrameData[4] = {};
int LeftEyeLinesize[4] = {};
uint8_t *RightEyeFrameData[4] = {};
int RightEyeLinesize[4] = {};

AVPacket *StashedPacket = nullptr;
bool ResendPacket = false;
Expand All @@ -118,6 +124,7 @@ struct FFMS_VideoSource {
int SeekMode;
bool SeekByPos = false;
bool HaveSeenInterlacedFrame = false;
bool IsLayered = false;

void ReAdjustOutputFormat(AVFrame *Frame);
FFMS_Frame *OutputFrame(AVFrame *Frame);
@@ -135,6 +142,7 @@
FFMS_Track *GetTrack() { return &Frames; }
FFMS_Frame *GetFrame(int n);
void GetFrameCheck(int n);
void CopyEye(AVStereo3DView view);
FFMS_Frame *GetFrameByTime(double Time);
void SetOutputFormat(const AVPixelFormat *TargetFormats, int Width, int Height, int Resizer);
void ResetOutputFormat();