From 499b3b6c7edb95f48e74e0520c48aeae2217ba54 Mon Sep 17 00:00:00 2001 From: Danil Chapovalov Date: Thu, 28 Nov 2019 17:38:35 +0100 Subject: [PATCH] In RtpDepacketizerAV1 use aggregation header to detect key frames instead of guessing based on presence of the sequence header OBU. Bug: webrtc:11042 Change-Id: I9a0674249feceebb07299ea965c62e87499f6baf Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/161013 Reviewed-by: Philip Eliasson Commit-Queue: Danil Chapovalov Cr-Commit-Position: refs/heads/master@{#29958} --- .../rtp_rtcp/source/rtp_depacketizer_av1.cc | 76 ++-------- .../source/rtp_depacketizer_av1_unittest.cc | 131 +++--------------- 2 files changed, 32 insertions(+), 175 deletions(-) diff --git a/modules/rtp_rtcp/source/rtp_depacketizer_av1.cc b/modules/rtp_rtcp/source/rtp_depacketizer_av1.cc index 52c62f87ab..71890e9b51 100644 --- a/modules/rtp_rtcp/source/rtp_depacketizer_av1.cc +++ b/modules/rtp_rtcp/source/rtp_depacketizer_av1.cc @@ -26,7 +26,7 @@ namespace { // RTP payload syntax: // 0 1 2 3 4 5 6 7 // +-+-+-+-+-+-+-+-+ -// |Z|Y| W |-|-|-|-| (REQUIRED) +// |Z|Y| W |N|-|-|-| (REQUIRED) // +=+=+=+=+=+=+=+=+ (REPEATED W-1 times, or any times if W = 0) // |1| | // +-+ OBU fragment| @@ -154,7 +154,6 @@ struct ObuInfo { // also has Sequence Header OBU. using VectorObuInfo = absl::InlinedVector; -constexpr int kObuTypeSequenceHeader = 1; constexpr uint8_t kObuSizePresentBit = 0b0'0000'010; bool ObuHasExtension(uint8_t obu_header) { @@ -165,10 +164,6 @@ bool ObuHasSize(uint8_t obu_header) { return obu_header & kObuSizePresentBit; } -int ObuType(uint8_t obu_header) { - return (obu_header & 0b0'1111'000u) >> 3; -} - bool RtpStartsWithFragment(uint8_t aggregation_header) { return aggregation_header & 0b1000'0000u; } @@ -178,6 +173,9 @@ bool RtpEndsWithFragment(uint8_t aggregation_header) { int RtpNumObus(uint8_t aggregation_header) { // 0 for any number of obus. return (aggregation_header & 0b0011'0000u) >> 4; } +int RtpStartsNewCodedVideoSequence(uint8_t aggregation_header) { + return aggregation_header & 0b0000'1000u; +} // Reorgonizes array of rtp payloads into array of obus: // fills ObuInfo::data field. @@ -373,16 +371,18 @@ bool RtpDepacketizerAv1::Parse(ParsedPayload* parsed_payload, RTC_DLOG(LS_ERROR) << "Empty rtp payload."; return false; } + uint8_t aggregation_header = payload_data[0]; + if (RtpStartsNewCodedVideoSequence(aggregation_header) && + RtpStartsWithFragment(aggregation_header)) { + // new coded video sequence can't start from an OBU fragment. + return false; + } + // To assemble frame, all of the rtp payload is required, including // aggregation header. parsed_payload->payload = payload_data; parsed_payload->payload_length = payload_data_length; - rtc::ByteBufferReader payload(reinterpret_cast(payload_data), - payload_data_length); - uint8_t aggregation_header; - RTC_CHECK(payload.ReadUInt8(&aggregation_header)); - parsed_payload->video.codec = VideoCodecType::kVideoCodecAV1; // These are not accurate since frame may consist of several packet aligned // chunks of obus, but should be good enough for most cases. It might produce @@ -393,57 +393,11 @@ bool RtpDepacketizerAv1::Parse(ParsedPayload* parsed_payload, !RtpStartsWithFragment(aggregation_header); parsed_payload->video.is_last_packet_in_frame = !RtpEndsWithFragment(aggregation_header); - parsed_payload->video.frame_type = VideoFrameType::kVideoFrameDelta; - // If packet starts a frame, check if it contains Sequence Header OBU. - // In that case treat it as key frame packet. - if (parsed_payload->video.is_first_packet_in_frame) { - int num_expected_obus = RtpNumObus(aggregation_header); - - // The only OBU that can preceed SequenceHeader is a TemporalDelimiter OBU, - // so check no more than two OBUs while searching for SH. - for (int obu_index = 1; payload.Length() > 0 && obu_index <= 2; - ++obu_index) { - uint64_t fragment_size; - // When num_expected_obus > 0, last OBU (fragment) is not preceeded by - // the size field. See W field in - // https://aomediacodec.github.io/av1-rtp-spec/#43-av1-aggregation-header - bool has_fragment_size = (obu_index != num_expected_obus); - if (has_fragment_size) { - if (!payload.ReadUVarint(&fragment_size)) { - RTC_DLOG(LS_WARNING) - << "Failed to read OBU fragment size for OBU#" << obu_index; - return false; - } - if (fragment_size > payload.Length()) { - RTC_DLOG(LS_WARNING) << "OBU fragment size " << fragment_size - << " exceeds remaining payload size " - << payload.Length() << " for OBU#" << obu_index; - // Malformed input: written size is larger than remaining buffer. - return false; - } - } else { - fragment_size = payload.Length(); - } - // Though it is inpractical to pass empty fragments, it is allowed. - if (fragment_size == 0) { - RTC_LOG(LS_WARNING) - << "Weird obu of size 0 at offset " - << (payload_data_length - payload.Length()) << ", skipping."; - continue; - } - uint8_t obu_header = *reinterpret_cast(payload.Data()); - if (ObuType(obu_header) == kObuTypeSequenceHeader) { - // TODO(bugs.webrtc.org/11042): Check frame_header OBU and/or frame OBU - // too for other conditions of the start of a new coded video sequence. - // For proper checks checking single packet might not be enough. See - // https://aomediacodec.github.io/av1-spec/av1-spec.pdf section 7.5 - parsed_payload->video.frame_type = VideoFrameType::kVideoFrameKey; - break; - } - payload.Consume(fragment_size); - } - } + parsed_payload->video.frame_type = + RtpStartsNewCodedVideoSequence(aggregation_header) + ? VideoFrameType::kVideoFrameKey + : VideoFrameType::kVideoFrameDelta; return true; } diff --git a/modules/rtp_rtcp/source/rtp_depacketizer_av1_unittest.cc b/modules/rtp_rtcp/source/rtp_depacketizer_av1_unittest.cc index cf55aaed20..d0d0670d15 100644 --- a/modules/rtp_rtcp/source/rtp_depacketizer_av1_unittest.cc +++ b/modules/rtp_rtcp/source/rtp_depacketizer_av1_unittest.cc @@ -19,12 +19,9 @@ namespace { using ::testing::ElementsAre; // Signals number of the OBU (fragments) in the packet. -constexpr uint8_t kObuCountAny = 0b0000'0000; -constexpr uint8_t kObuCountOne = 0b0001'0000; -constexpr uint8_t kObuCountTwo = 0b0010'0000; +constexpr uint8_t kObuCountOne = 0b00'01'0000; constexpr uint8_t kObuHeaderSequenceHeader = 0b0'0001'000; -constexpr uint8_t kObuHeaderTemporalDelimiter = 0b0'0010'000; constexpr uint8_t kObuHeaderFrame = 0b0'0110'000; constexpr uint8_t kObuHeaderHasSize = 0b0'0000'010; @@ -73,131 +70,37 @@ TEST(RtpDepacketizerAv1Test, ParseTreatsNoWillContinueFlagAsEndOfFrame) { EXPECT_TRUE(parsed.video.is_last_packet_in_frame); } -TEST(RtpDepacketizerAv1Test, ParseTreatsStartOfSequenceHeaderAsKeyFrame) { - const uint8_t packet[] = {kObuCountOne, kObuHeaderSequenceHeader}; - RtpDepacketizerAv1 depacketizer; - RtpDepacketizer::ParsedPayload parsed; - ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); - EXPECT_TRUE(parsed.video.is_first_packet_in_frame); - EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey); -} - -TEST(RtpDepacketizerAv1Test, ParseTreatsNotStartOfFrameAsDeltaFrame) { - const uint8_t packet[] = { - (uint8_t{1} << 7) | kObuCountOne, - // Byte that look like start of sequence header, but since it is not start - // of an OBU, it is actually not a start of sequence header. - kObuHeaderSequenceHeader}; - RtpDepacketizerAv1 depacketizer; - RtpDepacketizer::ParsedPayload parsed; - ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); - EXPECT_FALSE(parsed.video.is_first_packet_in_frame); - EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameDelta); -} - TEST(RtpDepacketizerAv1Test, - ParseTreatsStartOfFrameWithoutSequenceHeaderAsDeltaFrame) { - const uint8_t packet[] = {kObuCountOne, kObuHeaderFrame}; - RtpDepacketizerAv1 depacketizer; - RtpDepacketizer::ParsedPayload parsed; - ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); - EXPECT_TRUE(parsed.video.is_first_packet_in_frame); - EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameDelta); -} - -TEST(RtpDepacketizerAv1Test, ParseFindsSequenceHeaderBehindFragmentSize1) { - const uint8_t packet[] = {kObuCountAny, - 1, // size of the next fragment + ParseUsesNewCodedVideoSequenceBitAsKeyFrameIndidcator) { + const uint8_t packet[] = {(uint8_t{1} << 3) | kObuCountOne, kObuHeaderSequenceHeader}; RtpDepacketizerAv1 depacketizer; RtpDepacketizer::ParsedPayload parsed; ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); - EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey); -} - -TEST(RtpDepacketizerAv1Test, ParseFindsSequenceHeaderBehindFragmentSize2) { - const uint8_t packet[] = {kObuCountTwo, - 2, // size of the next fragment - kObuHeaderSequenceHeader, - 42, // SH payload. - kObuHeaderFrame}; - RtpDepacketizerAv1 depacketizer; - RtpDepacketizer::ParsedPayload parsed; - ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); + EXPECT_TRUE(parsed.video.is_first_packet_in_frame); EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey); } TEST(RtpDepacketizerAv1Test, - ParseFindsSequenceHeaderBehindMultiByteFragmentSize) { - const uint8_t packet[] = {kObuCountTwo, - 0b1000'0101, // leb128 encoded value of 5 - 0b1000'0000, // using 3 bytes - 0b0000'0000, // to encode the value. - kObuHeaderSequenceHeader, - 8, // 4 bytes of SH payload. - 0, - 0, - 0, - kObuHeaderFrame}; - RtpDepacketizerAv1 depacketizer; - RtpDepacketizer::ParsedPayload parsed; - ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); - EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey); -} - -TEST(RtpDepacketizerAv1Test, ParseFindsSequenceHeaderBehindTemporalDelimiter) { - const uint8_t packet[] = {kObuCountTwo, - 1, // size of the next fragment - kObuHeaderTemporalDelimiter, - kObuHeaderSequenceHeader, - 8, // 4 bytes of SH payload. - 0, - 0, - 0}; - RtpDepacketizerAv1 depacketizer; - RtpDepacketizer::ParsedPayload parsed; - ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); - EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey); -} - -TEST(RtpDepacketizerAv1Test, - ParseFindsSequenceHeaderBehindTemporalDelimiterAndSize) { - const uint8_t packet[] = {kObuCountAny, - 1, // size of the next fragment - kObuHeaderTemporalDelimiter, - 5, // size of the next fragment - kObuHeaderSequenceHeader, - 8, // 4 bytes of SH payload. - 0, - 0, - 0, - 1, // size of the next fragment - kObuHeaderFrame}; - RtpDepacketizerAv1 depacketizer; - RtpDepacketizer::ParsedPayload parsed; - ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); - EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey); -} - -TEST(RtpDepacketizerAv1Test, ParseSkipsEmptyFragments) { - static_assert(kObuHeaderSequenceHeader == 8, ""); - const uint8_t packet[] = {kObuCountAny, - 0, // size of the next fragment - 8, // size of the next fragment that look like SH - kObuHeaderFrame, - 1, - 2, - 3, - 4, - 5, - 6, - 7}; + ParseUsesUnsetNewCodedVideoSequenceBitAsDeltaFrameIndidcator) { + const uint8_t packet[] = {(uint8_t{0} << 3) | kObuCountOne, + kObuHeaderSequenceHeader}; RtpDepacketizerAv1 depacketizer; RtpDepacketizer::ParsedPayload parsed; ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet))); + EXPECT_TRUE(parsed.video.is_first_packet_in_frame); EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameDelta); } +TEST(RtpDepacketizerAv1Test, + ParseRejectsPacketWithNewCVSAndContinuationFlagsBothSet) { + const uint8_t packet[] = {0b10'00'1000 | kObuCountOne, + kObuHeaderSequenceHeader}; + RtpDepacketizerAv1 depacketizer; + RtpDepacketizer::ParsedPayload parsed; + EXPECT_FALSE(depacketizer.Parse(&parsed, packet, sizeof(packet))); +} + TEST(RtpDepacketizerAv1Test, AssembleFrameSetsOBUPayloadSizeWhenAbsent) { const uint8_t payload1[] = {0b00'01'0000, // aggregation header 0b0'0110'000, // / Frame