diff --git a/src/modules/audio_conference_mixer/source/audio_conference_mixer_impl.cc b/src/modules/audio_conference_mixer/source/audio_conference_mixer_impl.cc
index 64e6fbc53e..a2f21848f7 100644
--- a/src/modules/audio_conference_mixer/source/audio_conference_mixer_impl.cc
+++ b/src/modules/audio_conference_mixer/source/audio_conference_mixer_impl.cc
@@ -14,16 +14,51 @@
 #include "audio_processing.h"
 #include "critical_section_wrapper.h"
 #include "map_wrapper.h"
+#include "voice_engine/main/source/audio_frame_operations.h"
 #include "trace.h"
 
 namespace webrtc {
 namespace {
+
+// Mix |frame| into |mixed_frame|, with saturation protection and upmixing.
+// These effects are applied to |frame| itself prior to mixing. Assumes that
+// |mixed_frame| always has at least as many channels as |frame|. Supports
+// stereo at most.
+//
+// TODO(andrew): consider not modifying |frame| here.
+void MixFrames(AudioFrame* mixed_frame, AudioFrame* frame) {
+  assert(mixed_frame->num_channels_ >= frame->num_channels_);
+  // Divide by two to avoid saturation in the mixing.
+  *frame >>= 1;
+  if (mixed_frame->num_channels_ > frame->num_channels_) {
+    // We only support mono-to-stereo.
+    assert(mixed_frame->num_channels_ == 2 &&
+           frame->num_channels_ == 1);
+    AudioFrameOperations::MonoToStereo(*frame);
+  }
+
+  *mixed_frame += *frame;
+}
+
+// Return the max number of channels from a |list| composed of AudioFrames.
+int MaxNumChannels(const ListWrapper& list) {
+  ListItem* item = list.First();
+  int max_num_channels = 1;
+  while (item) {
+    AudioFrame* frame = static_cast<AudioFrame*>(item->GetItem());
+    max_num_channels = std::max(max_num_channels, frame->num_channels_);
+    item = list.Next(item);
+  }
+  return max_num_channels;
+}
+
 void SetParticipantStatistics(ParticipantStatistics* stats,
                               const AudioFrame& frame)
 {
     stats->participant = frame.id_;
     stats->level = 0; // TODO(andrew): to what should this be set?
 }
+
 } // namespace
 
 MixerParticipant::MixerParticipant()
@@ -283,25 +318,22 @@ WebRtc_Word32 AudioConferenceMixerImpl::Process()
     int retval = 0;
     WebRtc_Word32 audioLevel = 0;
     {
-        const ListItem* firstItem = mixList.First();
-        // Assume mono.
-        WebRtc_UWord8 numberOfChannels = 1;
-        if(firstItem != NULL)
-        {
-            // Use the same number of channels as the first frame to be mixed.
-            numberOfChannels = static_cast<const AudioFrame*>(
-                firstItem->GetItem())->num_channels_;
-        }
+        CriticalSectionScoped cs(_crit.get());
+
         // TODO(henrike): it might be better to decide the number of channels
         // with an API instead of dynamically.
 
-        CriticalSectionScoped cs(_crit.get());
-        if (!SetNumLimiterChannels(numberOfChannels))
+        // Find the max channels over all mixing lists.
+        const int num_mixed_channels = std::max(MaxNumChannels(mixList),
+            std::max(MaxNumChannels(additionalFramesList),
+                     MaxNumChannels(rampOutList)));
+
+        if (!SetNumLimiterChannels(num_mixed_channels))
             retval = -1;
 
         mixedAudio->UpdateFrame(-1, _timeStamp, NULL, 0, _outputFrequency,
                                 AudioFrame::kNormalSpeech,
-                                AudioFrame::kVadPassive, numberOfChannels);
+                                AudioFrame::kVadPassive, num_mixed_channels);
 
         _timeStamp += _sampleSize;
 
@@ -1108,10 +1140,7 @@ WebRtc_Word32 AudioConferenceMixerImpl::MixFromList(
             position = 0;
         }
         AudioFrame* audioFrame = static_cast<AudioFrame*>(item->GetItem());
-
-        // Divide by two to avoid saturation in the mixing.
-        *audioFrame >>= 1;
-        mixedAudio += *audioFrame;
+        MixFrames(&mixedAudio, audioFrame);
 
         SetParticipantStatistics(&_scratchMixedParticipants[position],
                                  *audioFrame);
@@ -1145,9 +1174,7 @@ WebRtc_Word32 AudioConferenceMixerImpl::MixAnonomouslyFromList(
     while(item != NULL)
     {
         AudioFrame* audioFrame = static_cast<AudioFrame*>(item->GetItem());
-        // Divide by two to avoid saturation in the mixing.
-        *audioFrame >>= 1;
-        mixedAudio += *audioFrame;
+        MixFrames(&mixedAudio, audioFrame);
         item = audioFrameList.Next(item);
     }
     return 0;
diff --git a/src/voice_engine/main/source/audio_frame_operations.cc b/src/voice_engine/main/source/audio_frame_operations.cc
index 123dc92c35..28f5ca8eb6 100644
--- a/src/voice_engine/main/source/audio_frame_operations.cc
+++ b/src/voice_engine/main/source/audio_frame_operations.cc
@@ -12,7 +12,6 @@
 #include "module_common_types.h"
 
 namespace webrtc {
-namespace voe {
 
 int AudioFrameOperations::MonoToStereo(AudioFrame& frame) {
   if (frame.num_channels_ != 1) {
@@ -101,6 +100,5 @@ int AudioFrameOperations::ScaleWithSat(float scale, AudioFrame& frame) {
   return 0;
 }
 
-} // namespace voe
 } // namespace webrtc
 
diff --git a/src/voice_engine/main/source/audio_frame_operations.h b/src/voice_engine/main/source/audio_frame_operations.h
index e680dcb6d3..753e4bfa2c 100644
--- a/src/voice_engine/main/source/audio_frame_operations.h
+++ b/src/voice_engine/main/source/audio_frame_operations.h
@@ -17,10 +17,9 @@ namespace webrtc {
 
 class AudioFrame;
 
-namespace voe {
-
-// TODO(andrew): unify this with utility.h. Change reference parameters to
-// pointers.
+// TODO(andrew): consolidate this with utility.h and audio_frame_manipulator.h.
+// Change reference parameters to pointers. Move out of VoE to a common place.
+// Consider using a namespace rather than class.
 class AudioFrameOperations {
  public:
   static int MonoToStereo(AudioFrame& frame);
@@ -38,7 +37,6 @@ class AudioFrameOperations {
   static int ScaleWithSat(float scale, AudioFrame& frame);
 };
 
-} // namespace voe
 } // namespace webrtc
 
 #endif // #ifndef WEBRTC_VOICE_ENGINE_AUDIO_FRAME_OPERATIONS_H_
diff --git a/src/voice_engine/main/test/auto_test/standard/mixing_test.cc b/src/voice_engine/main/test/auto_test/standard/mixing_test.cc
new file mode 100644
index 0000000000..5e6ca4a2f5
--- /dev/null
+++ b/src/voice_engine/main/test/auto_test/standard/mixing_test.cc
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+
+#include <string>
+
+#include "after_initialization_fixture.h"
+#include "test/testsupport/fileutils.h"
+
+namespace webrtc {
+namespace {
+
+const int16_t kLimiterHeadroom = 29204; // == -1 dbFS
+const int16_t kInt16Max = 0x7fff;
+const int kSampleRateHz = 16000;
+const int kTestDurationMs = 4000;
+
+} // namespace
+
+class MixingTest : public AfterInitializationFixture {
+ protected:
+  MixingTest()
+      : input_filename_(test::OutputPath() + "mixing_test_input.pcm"),
+        output_filename_(test::OutputPath() + "mixing_test_output.pcm") {
+  }
+
+  // Creates and mixes |num_remote_streams| which play a file "as microphone"
+  // with |num_local_streams| which play a file "locally", using a constant
+  // amplitude of |input_value|. The local streams manifest as "anonymous"
+  // mixing participants, meaning they will be mixed regardless of the number
+  // of participants. (A stream is a VoiceEngine "channel").
+  //
+  // The mixed output is verified to always fall between |max_output_value| and
+  // |min_output_value|, after a startup phase.
+  //
+  // |num_remote_streams_using_mono| of the remote streams use mono, with the
+  // remainder using stereo.
+  void RunMixingTest(int num_remote_streams,
+                     int num_local_streams,
+                     int num_remote_streams_using_mono,
+                     int16_t input_value,
+                     int16_t max_output_value,
+                     int16_t min_output_value) {
+    ASSERT_LE(num_remote_streams_using_mono, num_remote_streams);
+
+    GenerateInputFile(input_value);
+
+    std::vector<int> local_streams(num_local_streams);
+    for (size_t i = 0; i < local_streams.size(); ++i) {
+      local_streams[i] = voe_base_->CreateChannel();
+      EXPECT_NE(-1, local_streams[i]);
+    }
+    StartLocalStreams(local_streams);
+    TEST_LOG("Playing %d local streams.\n", num_local_streams);
+
+    std::vector<int> remote_streams(num_remote_streams);
+    for (size_t i = 0; i < remote_streams.size(); ++i) {
+      remote_streams[i] = voe_base_->CreateChannel();
+      EXPECT_NE(-1, remote_streams[i]);
+    }
+    StartRemoteStreams(remote_streams, num_remote_streams_using_mono);
+    TEST_LOG("Playing %d remote streams.\n", num_remote_streams);
+
+    // Start recording the mixed output and wait.
+    EXPECT_EQ(0, voe_file_->StartRecordingPlayout(-1 /* record meeting */,
+        output_filename_.c_str()));
+    Sleep(kTestDurationMs);
+    EXPECT_EQ(0, voe_file_->StopRecordingPlayout(-1));
+
+    StopLocalStreams(local_streams);
+    StopRemoteStreams(remote_streams);
+
+    VerifyMixedOutput(max_output_value, min_output_value);
+
+    // Cleanup the files in case another test uses different lengths.
+    ASSERT_EQ(0, remove(input_filename_.c_str()));
+    ASSERT_EQ(0, remove(output_filename_.c_str()));
+  }
+
+ private:
+  // Generate input file with constant values equal to |input_value|. The file
+  // will be one second longer than the duration of the test.
+  void GenerateInputFile(int16_t input_value) {
+    FILE* input_file = fopen(input_filename_.c_str(), "wb");
+    ASSERT_TRUE(input_file != NULL);
+    for (int i = 0; i < kSampleRateHz / 1000 * (kTestDurationMs + 1000); i++) {
+      ASSERT_EQ(1u, fwrite(&input_value, sizeof(input_value), 1, input_file));
+    }
+    ASSERT_EQ(0, fclose(input_file));
+  }
+
+  void VerifyMixedOutput(int16_t max_output_value, int16_t min_output_value) {
+    // Verify the mixed output.
+    FILE* output_file = fopen(output_filename_.c_str(), "rb");
+    ASSERT_TRUE(output_file != NULL);
+    int16_t output_value = 0;
+    // Skip the first 100 ms to avoid initialization and ramping-in effects.
+    EXPECT_EQ(0, fseek(output_file, sizeof(output_value) * kSampleRateHz / 10,
+                       SEEK_SET));
+    int samples_read = 0;
+    while (fread(&output_value, sizeof(output_value), 1, output_file) == 1) {
+      samples_read++;
+      EXPECT_LE(output_value, max_output_value);
+      EXPECT_GE(output_value, min_output_value);
+    }
+    // Ensure the recording length is close to the duration of the test.
+    ASSERT_GE((samples_read * 1000.0f) / kSampleRateHz,
+              0.9f * kTestDurationMs);
+    // Ensure we read the entire file.
+    ASSERT_NE(0, feof(output_file));
+    ASSERT_EQ(0, fclose(output_file));
+  }
+
+  // Start up local streams ("anonymous" participants).
+  void StartLocalStreams(const std::vector<int>& streams) {
+    for (size_t i = 0; i < streams.size(); ++i) {
+      EXPECT_EQ(0, voe_base_->StartPlayout(streams[i]));
+      EXPECT_EQ(0, voe_file_->StartPlayingFileLocally(streams[i],
+          input_filename_.c_str(), true));
+    }
+  }
+
+  void StopLocalStreams(const std::vector<int>& streams) {
+    for (size_t i = 0; i < streams.size(); ++i) {
+      EXPECT_EQ(0, voe_base_->StopPlayout(streams[i]));
+      EXPECT_EQ(0, voe_base_->DeleteChannel(streams[i]));
+    }
+  }
+
+  // Start up remote streams ("normal" participants).
+  void StartRemoteStreams(const std::vector<int>& streams,
+                          int num_remote_streams_using_mono) {
+    // Use L16 at 16kHz to minimize distortion (file recording is 16kHz and
+    // resampling will cause distortion).
+    CodecInst codec_inst;
+    strcpy(codec_inst.plname, "L16");
+    codec_inst.channels = 1;
+    codec_inst.plfreq = kSampleRateHz;
+    codec_inst.pltype = 105;
+    codec_inst.pacsize = codec_inst.plfreq / 100;
+    codec_inst.rate = codec_inst.plfreq * sizeof(int16_t) * 8; // 8 bits/byte.
+
+    for (int i = 0; i < num_remote_streams_using_mono; ++i) {
+      StartRemoteStream(streams[i], codec_inst, 1234 + 2 * i);
+    }
+
+    // The remainder of the streams will use stereo.
+    codec_inst.channels = 2;
+    codec_inst.pltype++;
+    for (size_t i = num_remote_streams_using_mono; i < streams.size(); ++i) {
+      StartRemoteStream(streams[i], codec_inst, 1234 + 2 * i);
+    }
+  }
+
+  // Start up a single remote stream.
+  void StartRemoteStream(int stream, const CodecInst& codec_inst, int port) {
+    EXPECT_EQ(0, voe_codec_->SetRecPayloadType(stream, codec_inst));
+    EXPECT_EQ(0, voe_base_->SetLocalReceiver(stream, port));
+    EXPECT_EQ(0, voe_base_->SetSendDestination(stream, port, "127.0.0.1"));
+    EXPECT_EQ(0, voe_base_->StartReceive(stream));
+    EXPECT_EQ(0, voe_base_->StartPlayout(stream));
+    EXPECT_EQ(0, voe_codec_->SetSendCodec(stream, codec_inst));
+    EXPECT_EQ(0, voe_base_->StartSend(stream));
+    EXPECT_EQ(0, voe_file_->StartPlayingFileAsMicrophone(stream,
+        input_filename_.c_str(), true));
+  }
+
+  void StopRemoteStreams(const std::vector<int>& streams) {
+    for (size_t i = 0; i < streams.size(); ++i) {
+      EXPECT_EQ(0, voe_base_->StopSend(streams[i]));
+      EXPECT_EQ(0, voe_base_->StopPlayout(streams[i]));
+      EXPECT_EQ(0, voe_base_->StopReceive(streams[i]));
+      EXPECT_EQ(0, voe_base_->DeleteChannel(streams[i]));
+    }
+  }
+
+  const std::string input_filename_;
+  const std::string output_filename_;
+};
+
+// These tests assume a maximum of three mixed participants. We typically allow
+// a +/- 10% range around the expected output level to account for distortion
+// from coding and processing in the loopback chain.
+TEST_F(MixingTest, FourChannelsWithOnlyThreeMixed) {
+  const int16_t kInputValue = 1000;
+  const int16_t kExpectedOutput = kInputValue * 3;
+  RunMixingTest(4, 0, 4, kInputValue, 1.1 * kExpectedOutput,
+                0.9 * kExpectedOutput);
+}
+
+// Ensure the mixing saturation protection is working. We can do this because
+// the mixing limiter is given some headroom, so the expected output is less
+// than full scale.
+TEST_F(MixingTest, VerifySaturationProtection) {
+  const int16_t kInputValue = 20000;
+  const int16_t kExpectedOutput = kLimiterHeadroom;
+  // If this isn't satisfied, we're not testing anything.
+  ASSERT_GT(kInputValue * 3, kInt16Max);
+  ASSERT_LT(1.1 * kExpectedOutput, kInt16Max);
+  RunMixingTest(3, 0, 3, kInputValue, 1.1 * kExpectedOutput,
+                0.9 * kExpectedOutput);
+}
+
+TEST_F(MixingTest, SaturationProtectionHasNoEffectOnOneChannel) {
+  const int16_t kInputValue = kInt16Max;
+  const int16_t kExpectedOutput = kInt16Max;
+  // If this isn't satisfied, we're not testing anything.
+  ASSERT_GT(0.95 * kExpectedOutput, kLimiterHeadroom);
+  // Tighter constraints are required here to properly test this.
+  RunMixingTest(1, 0, 1, kInputValue, kExpectedOutput,
+                0.95 * kExpectedOutput);
+}
+
+TEST_F(MixingTest, VerifyAnonymousAndNormalParticipantMixing) {
+  const int16_t kInputValue = 1000;
+  const int16_t kExpectedOutput = kInputValue * 2;
+  RunMixingTest(1, 1, 1, kInputValue, 1.1 * kExpectedOutput,
+                0.9 * kExpectedOutput);
+}
+
+TEST_F(MixingTest, AnonymousParticipantsAreAlwaysMixed) {
+  const int16_t kInputValue = 1000;
+  const int16_t kExpectedOutput = kInputValue * 4;
+  RunMixingTest(3, 1, 3, kInputValue, 1.1 * kExpectedOutput,
+                0.9 * kExpectedOutput);
+}
+
+TEST_F(MixingTest, VerifyStereoAndMonoMixing) {
+  const int16_t kInputValue = 1000;
+  const int16_t kExpectedOutput = kInputValue * 2;
+  RunMixingTest(2, 0, 1, kInputValue, 1.1 * kExpectedOutput,
+                0.9 * kExpectedOutput);
+}
+
+} // namespace webrtc
diff --git a/src/voice_engine/main/test/voice_engine_tests.gypi b/src/voice_engine/main/test/voice_engine_tests.gypi
index ee5f4073e7..4cd192ea34 100644
--- a/src/voice_engine/main/test/voice_engine_tests.gypi
+++ b/src/voice_engine/main/test/voice_engine_tests.gypi
@@ -52,6 +52,7 @@
         'auto_test/standard/hardware_before_streaming_test.cc',
         'auto_test/standard/hardware_test.cc',
         'auto_test/standard/manual_hold_test.cc',
+        'auto_test/standard/mixing_test.cc',
         'auto_test/standard/neteq_stats_test.cc',
         'auto_test/standard/neteq_test.cc',
         'auto_test/standard/network_before_streaming_test.cc',