Update how VP9 temporal up switch is populated

This CL updates both the static GOF patterns, which now carry the correct temporal_up_switch flags, and the flexible mode logic, which now bases the flag on dependency descriptors instead of reference buffers. Bug: webrtc:13576 Change-Id: I578f744bec51d1f3531da5f4a89d12f05a16a6c0 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/247187 Reviewed-by: Danil Chapovalov <danilchap@webrtc.org> Commit-Queue: Erik Språng <sprang@webrtc.org> Cr-Commit-Position: refs/heads/main@{#35741}
2022-01-19 14:17:14 +01:00
parent 1ca57b9015
commit 16cbed4782
4 changed files with 58 additions and 37 deletions
--- a/modules/video_coding/codecs/vp9/include/vp9_globals.h
+++ b/modules/video_coding/codecs/vp9/include/vp9_globals.h
@ -46,14 +46,14 @@ struct GofInfoVP9 {
      case kTemporalStructureMode1:
        num_frames_in_gof = 1;
        temporal_idx[0] = 0;
-        temporal_up_switch[0] = false;
+        temporal_up_switch[0] = true;
        num_ref_pics[0] = 1;
        pid_diff[0][0] = 1;
        break;
      case kTemporalStructureMode2:
        num_frames_in_gof = 2;
        temporal_idx[0] = 0;
-        temporal_up_switch[0] = false;
+        temporal_up_switch[0] = true;
        num_ref_pics[0] = 1;
        pid_diff[0][0] = 2;

@ -65,7 +65,7 @@ struct GofInfoVP9 {
      case kTemporalStructureMode3:
        num_frames_in_gof = 4;
        temporal_idx[0] = 0;
-        temporal_up_switch[0] = false;
+        temporal_up_switch[0] = true;
        num_ref_pics[0] = 1;
        pid_diff[0][0] = 4;

@ -87,7 +87,7 @@ struct GofInfoVP9 {
      case kTemporalStructureMode4:
        num_frames_in_gof = 8;
        temporal_idx[0] = 0;
-        temporal_up_switch[0] = false;
+        temporal_up_switch[0] = true;
        num_ref_pics[0] = 1;
        pid_diff[0][0] = 4;

@ -97,12 +97,12 @@ struct GofInfoVP9 {
        pid_diff[1][0] = 1;

        temporal_idx[2] = 1;
-        temporal_up_switch[2] = true;
+        temporal_up_switch[2] = false;
        num_ref_pics[2] = 1;
        pid_diff[2][0] = 2;

        temporal_idx[3] = 2;
-        temporal_up_switch[3] = false;
+        temporal_up_switch[3] = true;
        num_ref_pics[3] = 2;
        pid_diff[3][0] = 1;
        pid_diff[3][1] = 2;
@ -113,7 +113,7 @@ struct GofInfoVP9 {
        pid_diff[4][0] = 4;

        temporal_idx[5] = 2;
-        temporal_up_switch[5] = false;
+        temporal_up_switch[5] = true;
        num_ref_pics[5] = 2;
        pid_diff[5][0] = 1;
        pid_diff[5][1] = 2;
@ -125,7 +125,7 @@ struct GofInfoVP9 {
        pid_diff[6][1] = 4;

        temporal_idx[7] = 2;
-        temporal_up_switch[7] = false;
+        temporal_up_switch[7] = true;
        num_ref_pics[7] = 2;
        pid_diff[7][0] = 1;
        pid_diff[7][1] = 2;
@ -195,7 +195,10 @@ struct RTPVideoHeaderVP9 {
  uint8_t temporal_idx;     // Temporal layer index, or kNoTemporalIdx.
  uint8_t spatial_idx;      // Spatial layer index, or kNoSpatialIdx.
  bool temporal_up_switch;  // True if upswitch to higher frame rate is possible
-                            // starting from this frame.
+                            // meaning subsequent higher temporal layer pictures
+                            // will not depend on any picture before the current
+                            // picture (in coding order) with temporal layer ID
+                            // greater than `temporal_idx` of this frame.
  bool inter_layer_predicted;  // Frame is dependent on directly lower spatial
                               // layer frame.

--- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
+++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
@ -959,7 +959,7 @@ int LibvpxVp9Encoder::Encode(const VideoFrame& input_image,
    const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof;
    layer_id.temporal_layer_id = gof_.temporal_idx[gof_idx];

-    if (VideoCodecMode::kScreensharing == codec_.mode) {
+    if (codec_.mode == VideoCodecMode::kScreensharing) {
      const uint32_t frame_timestamp_ms =
          1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;

@ -1212,8 +1212,7 @@ int LibvpxVp9Encoder::Encode(const VideoFrame& input_image,

 bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
                                             absl::optional<int>* spatial_idx,
-                                             const vpx_codec_cx_pkt& pkt,
-                                             uint32_t timestamp) {
+                                             const vpx_codec_cx_pkt& pkt) {
  RTC_CHECK(codec_specific != nullptr);
  codec_specific->codecType = kVideoCodecVP9;
  CodecSpecificInfoVP9* vp9_info = &(codec_specific->codecSpecific.VP9);
@ -1248,9 +1247,6 @@ bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
    *spatial_idx = layer_id.spatial_layer_id;
  }

-  // TODO(asapersson): this info has to be obtained from the encoder.
-  vp9_info->temporal_up_switch = false;
-
  const bool is_key_pic = (pics_since_key_ == 0);
  const bool is_inter_layer_pred_allowed =
      (inter_layer_pred_ == InterLayerPredMode::kOn ||
@ -1283,6 +1279,20 @@ bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
                       vp9_info);
  if (vp9_info->flexible_mode) {
    vp9_info->gof_idx = kNoGofIdx;
+    if (!svc_controller_) {
+      if (num_temporal_layers_ == 1) {
+        vp9_info->temporal_up_switch = true;
+      } else {
+        // In flexible mode with > 1 temporal layer but no SVC controller we
+        // can't technically determine if a frame is an upswitch point; use
+        // gof-based data as proxy for now.
+        // TODO(sprang): Remove once SVC controller is the only choice.
+        vp9_info->gof_idx =
+            static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
+        vp9_info->temporal_up_switch =
+            gof_.temporal_up_switch[vp9_info->gof_idx];
+      }
+    }
  } else {
    vp9_info->gof_idx =
        static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
@ -1353,6 +1363,23 @@ bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
                svc_params_.scaling_factor_den[sid]);
      }
    }
+    if (is_flexible_mode_) {
+      // Populate data for legacy temporal-upswitch state.
+      // We can switch up to a higher temporal layer only if all temporal layers
+      // higher than this (within the current spatial layer) are switch points.
+      vp9_info->temporal_up_switch = true;
+      for (int i = layer_id.temporal_layer_id + 1; i < num_temporal_layers_;
+           ++i) {
+        // Assumes decode targets are always ordered first by spatial then by
+        // temporal id.
+        size_t dti_index =
+            (layer_id.spatial_layer_id * num_temporal_layers_) + i;
+        vp9_info->temporal_up_switch &=
+            (codec_specific->generic_frame_info
+                 ->decode_target_indications[dti_index] ==
+             DecodeTargetIndication::kSwitch);
+      }
+    }
  }
  return true;
 }
@ -1428,8 +1455,6 @@ void LibvpxVp9Encoder::FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
    ref_buf_list.push_back(ref_buf_.at(0));
  }

-  size_t max_ref_temporal_layer_id = 0;
-
  std::vector<size_t> ref_pid_list;

  vp9_info->num_ref_pics = 0;
@ -1461,9 +1486,6 @@ void LibvpxVp9Encoder::FillReferenceIndices(const vpx_codec_cx_pkt& pkt,

      vp9_info->p_diff[vp9_info->num_ref_pics] = static_cast<uint8_t>(p_diff);
      ++vp9_info->num_ref_pics;
-
-      max_ref_temporal_layer_id =
-          std::max(max_ref_temporal_layer_id, ref_buf.temporal_layer_id);
    } else {
      RTC_DCHECK(inter_layer_predicted);
      // RTP spec only allows to use previous spatial layer for inter-layer
@ -1471,10 +1493,6 @@ void LibvpxVp9Encoder::FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
      RTC_DCHECK_EQ(ref_buf.spatial_layer_id + 1, layer_id.spatial_layer_id);
    }
  }
-
-  vp9_info->temporal_up_switch =
-      (max_ref_temporal_layer_id <
-       static_cast<size_t>(layer_id.temporal_layer_id));
 }

 void LibvpxVp9Encoder::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
@ -1636,8 +1654,7 @@ void LibvpxVp9Encoder::GetEncodedLayerFrame(const vpx_codec_cx_pkt* pkt) {

  codec_specific_ = {};
  absl::optional<int> spatial_index;
-  if (!PopulateCodecSpecific(&codec_specific_, &spatial_index, *pkt,
-                             input_image_->timestamp())) {
+  if (!PopulateCodecSpecific(&codec_specific_, &spatial_index, *pkt)) {
    // Drop the frame.
    encoded_image_.set_size(0);
    return;
--- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h
+++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h
@ -67,8 +67,7 @@ class LibvpxVp9Encoder : public VP9Encoder {

  bool PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
                             absl::optional<int>* spatial_idx,
-                             const vpx_codec_cx_pkt& pkt,
-                             uint32_t timestamp);
+                             const vpx_codec_cx_pkt& pkt);
  void FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
                            const size_t pic_num,
                            const bool inter_layer_predicted,