AEC3: Downmix multichannel signals before delay estimation

Multichannel signals are downmixed to mono before decimation and delay estimation. This is useful when not all channels play audio content. The feature can be toggled in the AEC3 configuration.

Bug: webrtc:10913
Change-Id: I7d40edf7732bb51fec69e7f3ca063d821c5069c4
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/151762
Commit-Queue: Gustaf Ullberg <gustaf@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29126}
2019-09-10 09:36:43 +02:00
parent d181ee798d
commit ee84d39fce
15 changed files with 89 additions and 55 deletions
--- a/modules/audio_processing/aec3/block_processor.cc
+++ b/modules/audio_processing/aec3/block_processor.cc
@ -165,7 +165,7 @@ void BlockProcessorImpl::ProcessCapture(
    // alignment.
    estimated_delay_ = delay_controller_->GetDelay(
        render_buffer_->GetDownsampledRenderBuffer(), render_buffer_->Delay(),
-        (*capture_block)[0][0]);
+        (*capture_block)[0]);

    if (estimated_delay_) {
      bool delay_change =
--- a/modules/audio_processing/aec3/decimator.cc
+++ b/modules/audio_processing/aec3/decimator.cc
@ -69,14 +69,32 @@ Decimator::Decimator(size_t down_sampling_factor)
             down_sampling_factor_ == 8);
 }

-void Decimator::Decimate(rtc::ArrayView<const float> in,
+void Decimator::Decimate(const std::vector<std::vector<float>>& in,
+                         bool downmix,
                         rtc::ArrayView<float> out) {
-  RTC_DCHECK_EQ(kBlockSize, in.size());
+  RTC_DCHECK_EQ(kBlockSize, in[0].size());
  RTC_DCHECK_EQ(kBlockSize / down_sampling_factor_, out.size());
+  std::array<float, kBlockSize> in_downmixed;
  std::array<float, kBlockSize> x;

+  // Mix channels before decimation.
+  std::copy(in[0].begin(), in[0].end(), in_downmixed.begin());
+  if (downmix && in.size() > 1) {
+    for (size_t channel = 1; channel < in.size(); channel++) {
+      const auto& data = in[channel];
+      for (size_t i = 0; i < kBlockSize; i++) {
+        in_downmixed[i] += data[i];
+      }
+    }
+
+    const float one_by_num_channels = 1.f / in.size();
+    for (size_t i = 0; i < kBlockSize; i++) {
+      in_downmixed[i] *= one_by_num_channels;
+    }
+  }
+
  // Limit the frequency content of the signal to avoid aliasing.
-  anti_aliasing_filter_.Process(in, x);
+  anti_aliasing_filter_.Process(in_downmixed, x);

  // Reduce the impact of near-end noise.
  noise_reduction_filter_.Process(x);
--- a/modules/audio_processing/aec3/decimator.h
+++ b/modules/audio_processing/aec3/decimator.h
@ -12,6 +12,7 @@
 #define MODULES_AUDIO_PROCESSING_AEC3_DECIMATOR_H_

 #include <array>
+#include <vector>

 #include "api/array_view.h"
 #include "modules/audio_processing/aec3/aec3_common.h"
@ -26,7 +27,9 @@ class Decimator {
  explicit Decimator(size_t down_sampling_factor);

  // Downsamples the signal.
-  void Decimate(rtc::ArrayView<const float> in, rtc::ArrayView<float> out);
+  void Decimate(const std::vector<std::vector<float>>& in,
+                bool downmix,
+                rtc::ArrayView<float> out);

 private:
  const size_t down_sampling_factor_;
--- a/modules/audio_processing/aec3/decimator_unittest.cc
+++ b/modules/audio_processing/aec3/decimator_unittest.cc
@ -15,6 +15,7 @@
 #include <algorithm>
 #include <array>
 #include <cmath>
+#include <cstring>
 #include <numeric>
 #include <string>
 #include <vector>
@ -57,10 +58,11 @@ void ProduceDecimatedSinusoidalOutputPower(int sample_rate_hz,

  for (size_t k = 0; k < kNumBlocks; ++k) {
    std::vector<float> sub_block(sub_block_size);
-
-    decimator.Decimate(
-        rtc::ArrayView<const float>(&input[k * kBlockSize], kBlockSize),
-        sub_block);
+    std::vector<std::vector<float>> input_multichannel(
+        1, std::vector<float>(kBlockSize));
+    memcpy(input_multichannel[0].data(), &input[k * kBlockSize],
+           kBlockSize * sizeof(float));
+    decimator.Decimate(input_multichannel, true, sub_block);

    std::copy(sub_block.begin(), sub_block.end(),
              output.begin() + k * sub_block_size);
@ -105,24 +107,24 @@ TEST(Decimator, NoLeakageFromUpperFrequencies) {
 // Verifies the check for the input size.
 TEST(Decimator, WrongInputSize) {
  Decimator decimator(4);
-  std::vector<float> x(std::vector<float>(kBlockSize - 1, 0.f));
+  std::vector<std::vector<float>> x(1, std::vector<float>(kBlockSize - 1, 0.f));
  std::array<float, kBlockSize / 4> x_downsampled;
-  EXPECT_DEATH(decimator.Decimate(x, x_downsampled), "");
+  EXPECT_DEATH(decimator.Decimate(x, true, x_downsampled), "");
 }

 // Verifies the check for non-null output parameter.
 TEST(Decimator, NullOutput) {
  Decimator decimator(4);
-  std::vector<float> x(std::vector<float>(kBlockSize, 0.f));
-  EXPECT_DEATH(decimator.Decimate(x, nullptr), "");
+  std::vector<std::vector<float>> x(1, std::vector<float>(kBlockSize, 0.f));
+  EXPECT_DEATH(decimator.Decimate(x, true, nullptr), "");
 }

 // Verifies the check for the output size.
 TEST(Decimator, WrongOutputSize) {
  Decimator decimator(4);
-  std::vector<float> x(std::vector<float>(kBlockSize, 0.f));
+  std::vector<std::vector<float>> x(1, std::vector<float>(kBlockSize, 0.f));
  std::array<float, kBlockSize / 4 - 1> x_downsampled;
-  EXPECT_DEATH(decimator.Decimate(x, x_downsampled), "");
+  EXPECT_DEATH(decimator.Decimate(x, true, x_downsampled), "");
 }

 // Verifies the check for the correct downsampling factor.
--- a/modules/audio_processing/aec3/echo_path_delay_estimator.cc
+++ b/modules/audio_processing/aec3/echo_path_delay_estimator.cc
@ -42,7 +42,8 @@ EchoPathDelayEstimator::EchoPathDelayEstimator(
          config.delay.delay_candidate_detection_threshold),
      matched_filter_lag_aggregator_(data_dumper_,
                                     matched_filter_.GetMaxFilterLag(),
-                                     config.delay.delay_selection_thresholds) {
+                                     config.delay.delay_selection_thresholds),
+      downmix_(config.delay.downmix_before_delay_estimation) {
  RTC_DCHECK(data_dumper);
  RTC_DCHECK(down_sampling_factor_ > 0);
 }
@ -55,15 +56,13 @@ void EchoPathDelayEstimator::Reset(bool reset_delay_confidence) {

 absl::optional<DelayEstimate> EchoPathDelayEstimator::EstimateDelay(
    const DownsampledRenderBuffer& render_buffer,
-    rtc::ArrayView<const float> capture) {
-  RTC_DCHECK_EQ(kBlockSize, capture.size());
+    const std::vector<std::vector<float>>& capture) {
+  RTC_DCHECK_EQ(kBlockSize, capture[0].size());

  std::array<float, kBlockSize> downsampled_capture_data;
  rtc::ArrayView<float> downsampled_capture(downsampled_capture_data.data(),
                                            sub_block_size_);
-  data_dumper_->DumpWav("aec3_capture_decimator_input", capture.size(),
-                        capture.data(), 16000, 1);
-  capture_decimator_.Decimate(capture, downsampled_capture);
+  capture_decimator_.Decimate(capture, downmix_, downsampled_capture);
  data_dumper_->DumpWav("aec3_capture_decimator_output",
                        downsampled_capture.size(), downsampled_capture.data(),
                        16000 / down_sampling_factor_, 1);
--- a/modules/audio_processing/aec3/echo_path_delay_estimator.h
+++ b/modules/audio_processing/aec3/echo_path_delay_estimator.h
@ -42,7 +42,7 @@ class EchoPathDelayEstimator {
  // Produce a delay estimate if such is avaliable.
  absl::optional<DelayEstimate> EstimateDelay(
      const DownsampledRenderBuffer& render_buffer,
-      rtc::ArrayView<const float> capture);
+      const std::vector<std::vector<float>>& capture);

  // Log delay estimator properties.
  void LogDelayEstimationProperties(int sample_rate_hz, size_t shift) const {
@ -65,6 +65,7 @@ class EchoPathDelayEstimator {
  absl::optional<DelayEstimate> old_aggregated_lag_;
  size_t consistent_estimate_counter_ = 0;
  ClockdriftDetector clockdrift_detector_;
+  bool downmix_;

  // Internal reset method with more granularity.
  void Reset(bool reset_lag_aggregator, bool reset_delay_confidence);
--- a/modules/audio_processing/aec3/echo_path_delay_estimator_unittest.cc
+++ b/modules/audio_processing/aec3/echo_path_delay_estimator_unittest.cc
@ -47,7 +47,7 @@ TEST(EchoPathDelayEstimator, BasicApiCalls) {
  std::vector<std::vector<std::vector<float>>> render(
      kNumBands, std::vector<std::vector<float>>(
                     kNumChannels, std::vector<float>(kBlockSize)));
-  std::vector<float> capture(kBlockSize);
+  std::vector<std::vector<float>> capture(1, std::vector<float>(kBlockSize));
  for (size_t k = 0; k < 100; ++k) {
    render_delay_buffer->Insert(render);
    estimator.EstimateDelay(render_delay_buffer->GetDownsampledRenderBuffer(),
@ -66,7 +66,7 @@ TEST(EchoPathDelayEstimator, DelayEstimation) {
  std::vector<std::vector<std::vector<float>>> render(
      kNumBands, std::vector<std::vector<float>>(
                     kNumChannels, std::vector<float>(kBlockSize)));
-  std::vector<float> capture(kBlockSize);
+  std::vector<std::vector<float>> capture(1, std::vector<float>(kBlockSize));
  ApmDataDumper data_dumper(0);
  constexpr size_t kDownSamplingFactors[] = {2, 4, 8};
  for (auto down_sampling_factor : kDownSamplingFactors) {
@ -83,7 +83,7 @@ TEST(EchoPathDelayEstimator, DelayEstimation) {
      absl::optional<DelayEstimate> estimated_delay_samples;
      for (size_t k = 0; k < (500 + (delay_samples) / kBlockSize); ++k) {
        RandomizeSampleVector(&random_generator, render[0][0]);
-        signal_delay_buffer.Delay(render[0][0], capture);
+        signal_delay_buffer.Delay(render[0][0], capture[0]);
        render_delay_buffer->Insert(render);

        if (k == 0) {
@ -125,7 +125,7 @@ TEST(EchoPathDelayEstimator, NoDelayEstimatesForLowLevelRenderSignals) {
  std::vector<std::vector<std::vector<float>>> render(
      kNumBands, std::vector<std::vector<float>>(
                     kNumChannels, std::vector<float>(kBlockSize)));
-  std::vector<float> capture(kBlockSize);
+  std::vector<std::vector<float>> capture(1, std::vector<float>(kBlockSize));
  ApmDataDumper data_dumper(0);
  EchoPathDelayEstimator estimator(&data_dumper, config);
  std::unique_ptr<RenderDelayBuffer> render_delay_buffer(
@ -136,7 +136,7 @@ TEST(EchoPathDelayEstimator, NoDelayEstimatesForLowLevelRenderSignals) {
    for (auto& render_k : render[0][0]) {
      render_k *= 100.f / 32767.f;
    }
-    std::copy(render[0][0].begin(), render[0][0].end(), capture.begin());
+    std::copy(render[0][0].begin(), render[0][0].end(), capture[0].begin());
    render_delay_buffer->Insert(render);
    render_delay_buffer->PrepareCaptureProcessing();
    EXPECT_FALSE(estimator.EstimateDelay(
@ -155,7 +155,7 @@ TEST(EchoPathDelayEstimator, DISABLED_WrongRenderBlockSize) {
  EchoPathDelayEstimator estimator(&data_dumper, config);
  std::unique_ptr<RenderDelayBuffer> render_delay_buffer(
      RenderDelayBuffer::Create(config, 48000, 1));
-  std::vector<float> capture(kBlockSize);
+  std::vector<std::vector<float>> capture(1, std::vector<float>(kBlockSize));
  EXPECT_DEATH(estimator.EstimateDelay(
                   render_delay_buffer->GetDownsampledRenderBuffer(), capture),
               "");
@ -170,7 +170,8 @@ TEST(EchoPathDelayEstimator, WrongCaptureBlockSize) {
  EchoPathDelayEstimator estimator(&data_dumper, config);
  std::unique_ptr<RenderDelayBuffer> render_delay_buffer(
      RenderDelayBuffer::Create(config, 48000, 1));
-  std::vector<float> capture(std::vector<float>(kBlockSize - 1));
+  std::vector<std::vector<float>> capture(1,
+                                          std::vector<float>(kBlockSize - 1));
  EXPECT_DEATH(estimator.EstimateDelay(
                   render_delay_buffer->GetDownsampledRenderBuffer(), capture),
               "");
--- a/modules/audio_processing/aec3/matched_filter_unittest.cc
+++ b/modules/audio_processing/aec3/matched_filter_unittest.cc
@ -150,8 +150,8 @@ TEST(MatchedFilter, LagEstimation) {
    std::vector<std::vector<std::vector<float>>> render(
        kNumBands, std::vector<std::vector<float>>(
                       kNumChannels, std::vector<float>(kBlockSize, 0.f)));
-    std::array<float, kBlockSize> capture;
-    capture.fill(0.f);
+    std::vector<std::vector<float>> capture(
+        1, std::vector<float>(kBlockSize, 0.f));
    ApmDataDumper data_dumper(0);
    for (size_t delay_samples : {5, 64, 150, 200, 800, 1000}) {
      SCOPED_TRACE(ProduceDebugText(delay_samples, down_sampling_factor));
@ -177,7 +177,7 @@ TEST(MatchedFilter, LagEstimation) {
            RandomizeSampleVector(&random_generator, render[band][channel]);
          }
        }
-        signal_delay_buffer.Delay(render[0][0], capture);
+        signal_delay_buffer.Delay(render[0][0], capture[0]);
        render_delay_buffer->Insert(render);

        if (k == 0) {
@ -188,7 +188,7 @@ TEST(MatchedFilter, LagEstimation) {
        std::array<float, kBlockSize> downsampled_capture_data;
        rtc::ArrayView<float> downsampled_capture(
            downsampled_capture_data.data(), sub_block_size);
-        capture_decimator.Decimate(capture, downsampled_capture);
+        capture_decimator.Decimate(capture, true, downsampled_capture);
        filter.Update(render_delay_buffer->GetDownsampledRenderBuffer(),
                      downsampled_capture);
      }
@ -312,8 +312,8 @@ TEST(MatchedFilter, LagNotUpdatedForLowLevelRender) {
    std::vector<std::vector<std::vector<float>>> render(
        kNumBands, std::vector<std::vector<float>>(
                       kNumChannels, std::vector<float>(kBlockSize, 0.f)));
-    std::array<float, kBlockSize> capture;
-    capture.fill(0.f);
+    std::vector<std::vector<float>> capture(
+        1, std::vector<float>(kBlockSize, 0.f));
    ApmDataDumper data_dumper(0);
    EchoCanceller3Config config;
    MatchedFilter filter(&data_dumper, DetectOptimization(), sub_block_size,
@ -332,11 +332,11 @@ TEST(MatchedFilter, LagNotUpdatedForLowLevelRender) {
      for (auto& render_k : render[0][0]) {
        render_k *= 149.f / 32767.f;
      }
-      std::copy(render[0][0].begin(), render[0][0].end(), capture.begin());
+      std::copy(render[0][0].begin(), render[0][0].end(), capture[0].begin());
      std::array<float, kBlockSize> downsampled_capture_data;
      rtc::ArrayView<float> downsampled_capture(downsampled_capture_data.data(),
                                                sub_block_size);
-      capture_decimator.Decimate(capture, downsampled_capture);
+      capture_decimator.Decimate(capture, true, downsampled_capture);
      filter.Update(render_delay_buffer->GetDownsampledRenderBuffer(),
                    downsampled_capture);
    }
--- a/modules/audio_processing/aec3/mock/mock_render_delay_controller.h
+++ b/modules/audio_processing/aec3/mock/mock_render_delay_controller.h
@ -31,7 +31,7 @@ class MockRenderDelayController : public RenderDelayController {
               absl::optional<DelayEstimate>(
                   const DownsampledRenderBuffer& render_buffer,
                   size_t render_delay_buffer_delay,
-                   rtc::ArrayView<const float> capture));
+                   const std::vector<std::vector<float>>& capture));
  MOCK_CONST_METHOD0(HasClockdrift, bool());
 };

--- a/modules/audio_processing/aec3/render_delay_buffer.cc
+++ b/modules/audio_processing/aec3/render_delay_buffer.cc
@ -377,9 +377,8 @@ void RenderDelayBufferImpl::InsertBlock(
    std::copy(block[k].begin(), block[k].end(), b.buffer[b.write][k].begin());
  }

-  data_dumper_->DumpWav("aec3_render_decimator_input", block[0][0].size(),
-                        block[0][0].data(), 16000, 1);
-  render_decimator_.Decimate(block[0][0], ds);
+  render_decimator_.Decimate(block[0],
+                             config_.delay.downmix_before_delay_estimation, ds);
  data_dumper_->DumpWav("aec3_render_decimator_output", ds.size(), ds.data(),
                        16000 / down_sampling_factor_, 1);
  std::copy(ds.rbegin(), ds.rend(), lr.buffer.begin() + lr.write);
--- a/modules/audio_processing/aec3/render_delay_controller.cc
+++ b/modules/audio_processing/aec3/render_delay_controller.cc
@ -41,7 +41,7 @@ class RenderDelayControllerImpl final : public RenderDelayController {
  absl::optional<DelayEstimate> GetDelay(
      const DownsampledRenderBuffer& render_buffer,
      size_t render_delay_buffer_delay,
-      rtc::ArrayView<const float> capture) override;
+      const std::vector<std::vector<float>>& capture) override;
  bool HasClockdrift() const override;

 private:
@ -118,8 +118,8 @@ void RenderDelayControllerImpl::LogRenderCall() {}
 absl::optional<DelayEstimate> RenderDelayControllerImpl::GetDelay(
    const DownsampledRenderBuffer& render_buffer,
    size_t render_delay_buffer_delay,
-    rtc::ArrayView<const float> capture) {
-  RTC_DCHECK_EQ(kBlockSize, capture.size());
+    const std::vector<std::vector<float>>& capture) {
+  RTC_DCHECK_EQ(kBlockSize, capture[0].size());
  ++capture_call_counter_;

  auto delay_samples = delay_estimator_.EstimateDelay(render_buffer, capture);
--- a/modules/audio_processing/aec3/render_delay_controller.h
+++ b/modules/audio_processing/aec3/render_delay_controller.h
@ -39,7 +39,7 @@ class RenderDelayController {
  virtual absl::optional<DelayEstimate> GetDelay(
      const DownsampledRenderBuffer& render_buffer,
      size_t render_delay_buffer_delay,
-      rtc::ArrayView<const float> capture) = 0;
+      const std::vector<std::vector<float>>& capture) = 0;

  // Returns true if clockdrift has been detected.
  virtual bool HasClockdrift() const = 0;
--- a/modules/audio_processing/aec3/render_delay_controller_unittest.cc
+++ b/modules/audio_processing/aec3/render_delay_controller_unittest.cc
@ -46,7 +46,7 @@ constexpr size_t kDownSamplingFactors[] = {2, 4, 8};

 // Verifies the output of GetDelay when there are no AnalyzeRender calls.
 TEST(RenderDelayController, NoRenderSignal) {
-  std::vector<float> block(kBlockSize, 0.f);
+  std::vector<std::vector<float>> block(1, std::vector<float>(kBlockSize, 0.f));
  EchoCanceller3Config config;
  for (size_t num_matched_filters = 4; num_matched_filters == 10;
       num_matched_filters++) {
@ -73,7 +73,8 @@ TEST(RenderDelayController, NoRenderSignal) {
 // Verifies the basic API call sequence.
 TEST(RenderDelayController, BasicApiCalls) {
  constexpr size_t kNumChannels = 1;
-  std::vector<float> capture_block(kBlockSize, 0.f);
+  std::vector<std::vector<float>> capture_block(
+      1, std::vector<float>(kBlockSize, 0.f));
  absl::optional<DelayEstimate> delay_blocks;
  for (size_t num_matched_filters = 4; num_matched_filters == 10;
       num_matched_filters++) {
@ -109,7 +110,8 @@ TEST(RenderDelayController, BasicApiCalls) {
 // simple timeshifts between the signals.
 TEST(RenderDelayController, Alignment) {
  Random random_generator(42U);
-  std::vector<float> capture_block(kBlockSize, 0.f);
+  std::vector<std::vector<float>> capture_block(
+      1, std::vector<float>(kBlockSize, 0.f));
  for (size_t num_matched_filters = 4; num_matched_filters == 10;
       num_matched_filters++) {
    for (auto down_sampling_factor : kDownSamplingFactors) {
@ -140,7 +142,7 @@ TEST(RenderDelayController, Alignment) {
                                        render_block[band][channel]);
                }
              }
-              signal_delay_buffer.Delay(render_block[0][0], capture_block);
+              signal_delay_buffer.Delay(render_block[0][0], capture_block[0]);
              render_delay_buffer->Insert(render_block);
              render_delay_buffer->PrepareCaptureProcessing();
              delay_blocks = delay_controller->GetDelay(
@ -200,7 +202,7 @@ TEST(RenderDelayController, NonCausalAlignment) {
            render_delay_buffer->PrepareCaptureProcessing();
            delay_blocks = delay_controller->GetDelay(
                render_delay_buffer->GetDownsampledRenderBuffer(),
-                render_delay_buffer->Delay(), capture_block[0][0]);
+                render_delay_buffer->Delay(), capture_block[0]);
          }

          ASSERT_FALSE(delay_blocks);
@ -215,7 +217,8 @@ TEST(RenderDelayController, NonCausalAlignment) {
 TEST(RenderDelayController, AlignmentWithJitter) {
  Random random_generator(42U);
  constexpr size_t kNumRenderChannels = 1;
-  std::vector<float> capture_block(kBlockSize, 0.f);
+  std::vector<std::vector<float>> capture_block(
+      1, std::vector<float>(kBlockSize, 0.f));
  for (size_t num_matched_filters = 4; num_matched_filters == 10;
       num_matched_filters++) {
    for (auto down_sampling_factor : kDownSamplingFactors) {
@ -240,10 +243,10 @@ TEST(RenderDelayController, AlignmentWithJitter) {
               j <
               (1000 + delay_samples / kBlockSize) / kMaxTestJitterBlocks + 1;
               ++j) {
-            std::vector<std::vector<float>> capture_block_buffer;
+            std::vector<std::vector<std::vector<float>>> capture_block_buffer;
            for (size_t k = 0; k < (kMaxTestJitterBlocks - 1); ++k) {
              RandomizeSampleVector(&random_generator, render_block[0][0]);
-              signal_delay_buffer.Delay(render_block[0][0], capture_block);
+              signal_delay_buffer.Delay(render_block[0][0], capture_block[0]);
              capture_block_buffer.push_back(capture_block);
              render_delay_buffer->Insert(render_block);
            }
@ -297,7 +300,8 @@ TEST(RenderDelayController, InitialHeadroom) {

 // Verifies the check for the capture signal block size.
 TEST(RenderDelayController, WrongCaptureSize) {
-  std::vector<float> block(kBlockSize - 1, 0.f);
+  std::vector<std::vector<float>> block(
+      1, std::vector<float>(kBlockSize - 1, 0.f));
  EchoCanceller3Config config;
  for (auto rate : {16000, 32000, 48000}) {
    SCOPED_TRACE(ProduceDebugText(rate));