Revert "Reland "Remove unused APM voice activity detection sub-module""
This reverts commit 54d1344d985b00d4d1580dd18057d4618c11ad1f.

Reason for revert: Breaks chromium roll, see
https://ci.chromium.org/ui/p/chromium/builders/try/linux_chromium_tsan_rel_ng/1080583/overview

https://chromium-review.googlesource.com/c/chromium/src/+/3461512

Original change's description:
> Reland "Remove unused APM voice activity detection sub-module"
>
> This reverts commit a751f167c68343f76528436defdbc61600a8d7b3.
>
> Reason for revert: dependency in a downstream project removed
>
> Original change's description:
> > Revert "Remove unused APM voice activity detection sub-module"
> >
> > This reverts commit b4e06d032e6f82a65c52ed0c5364ae9e7c0a0215.
> >
> > Reason for revert: breaking downstream projects
> >
> > Original change's description:
> > > Remove unused APM voice activity detection sub-module
> > >
> > > API changes:
> > > - webrtc::AudioProcessing::Config::VoiceDetection removed
> > > - webrtc::AudioProcessingStats::voice_detected deprecated
> > > - cricket::AudioOptions::typing_detection deprecated
> > > - webrtc::StatsReport::StatsValueName::
> > >   kStatsValueNameTypingNoiseState deprecated
> > >
> > > PSA: https://groups.google.com/g/discuss-webrtc/c/7X6uwmJarE0
> > >
> > > Bug: webrtc:11226,webrtc:11292
> > > Change-Id: I8d008b56708cf62961b9857ec052b59fda3b41bf
> > > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/250666
> > > Reviewed-by: Harald Alvestrand <hta@webrtc.org>
> > > Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
> > > Reviewed-by: Sam Zackrisson <saza@webrtc.org>
> > > Reviewed-by: Björn Terelius <terelius@webrtc.org>
> > > Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
> > > Cr-Commit-Position: refs/heads/main@{#35975}
> >
> > TBR=gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com
> >
> > Change-Id: Iee01fdb874b4e0331277f3ffe60dacaabc3859a2
> > No-Presubmit: true
> > No-Tree-Checks: true
> > No-Try: true
> > Bug: webrtc:11226,webrtc:11292
> > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251600
> > Reviewed-by: Harald Alvestrand <hta@webrtc.org>
> > Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
> > Commit-Queue: Mirko Bonadei <mbonadei@webrtc.org>
> > Cr-Commit-Position: refs/heads/main@{#35977}
>
> # Not skipping CQ checks because this is a reland.
>
> Bug: webrtc:11226,webrtc:11292
> Change-Id: I2fcbc5fdade16bfe6a0f0a02841a33a598d4f2ad
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251660
> Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
> Reviewed-by: Harald Alvestrand <hta@webrtc.org>
> Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
> Cr-Commit-Position: refs/heads/main@{#35984}

TBR=mbonadei@webrtc.org,gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com

Change-Id: Ib308a3af2dcce85a0074ef5a4680ccec3f82712f
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: webrtc:11226,webrtc:11292
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251688
Reviewed-by: Henrik Boström <hbos@webrtc.org>
Bot-Commit: rubber-stamper@appspot.gserviceaccount.com <rubber-stamper@appspot.gserviceaccount.com>
Auto-Submit: Henrik Boström <hbos@webrtc.org>
Reviewed-by: Harald Alvestrand <hta@webrtc.org>
Commit-Queue: Harald Alvestrand <hta@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#35990}
This commit is contained in:

committed by
WebRTC LUCI CQ

parent
eb6c6fcf27
commit
09aaf6f7bc
@ -60,8 +60,6 @@ struct RTC_EXPORT AudioOptions {
|
|||||||
absl::optional<int> audio_jitter_buffer_min_delay_ms;
|
absl::optional<int> audio_jitter_buffer_min_delay_ms;
|
||||||
// Audio receiver jitter buffer (NetEq) should handle retransmitted packets.
|
// Audio receiver jitter buffer (NetEq) should handle retransmitted packets.
|
||||||
absl::optional<bool> audio_jitter_buffer_enable_rtx_handling;
|
absl::optional<bool> audio_jitter_buffer_enable_rtx_handling;
|
||||||
// Deprecated.
|
|
||||||
// TODO(bugs.webrtc.org/11226): Remove.
|
|
||||||
// Audio processing to detect typing.
|
// Audio processing to detect typing.
|
||||||
absl::optional<bool> typing_detection;
|
absl::optional<bool> typing_detection;
|
||||||
// TODO(bugs.webrtc.org/11539): Deprecated, replaced by
|
// TODO(bugs.webrtc.org/11539): Deprecated, replaced by
|
||||||
|
@ -648,7 +648,6 @@ const char* StatsReport::Value::display_name() const {
|
|||||||
return "googTrackId";
|
return "googTrackId";
|
||||||
case kStatsValueNameTimingFrameInfo:
|
case kStatsValueNameTimingFrameInfo:
|
||||||
return "googTimingFrameInfo";
|
return "googTimingFrameInfo";
|
||||||
// TODO(bugs.webrtc.org/11226): Remove.
|
|
||||||
case kStatsValueNameTypingNoiseState:
|
case kStatsValueNameTypingNoiseState:
|
||||||
return "googTypingNoiseState";
|
return "googTypingNoiseState";
|
||||||
case kStatsValueNameWritable:
|
case kStatsValueNameWritable:
|
||||||
|
@ -235,7 +235,6 @@ class RTC_EXPORT StatsReport {
|
|||||||
kStatsValueNameTrackId,
|
kStatsValueNameTrackId,
|
||||||
kStatsValueNameTransmitBitrate,
|
kStatsValueNameTransmitBitrate,
|
||||||
kStatsValueNameTransportType,
|
kStatsValueNameTransportType,
|
||||||
// TODO(bugs.webrtc.org/11226): Remove.
|
|
||||||
kStatsValueNameTypingNoiseState,
|
kStatsValueNameTypingNoiseState,
|
||||||
kStatsValueNameWritable,
|
kStatsValueNameWritable,
|
||||||
kStatsValueNameAudioDeviceUnderrunCounter,
|
kStatsValueNameAudioDeviceUnderrunCounter,
|
||||||
|
@ -165,6 +165,24 @@ int32_t AudioTransportImpl::RecordedDataIsAvailable(
|
|||||||
audio_frame.get());
|
audio_frame.get());
|
||||||
audio_frame->set_absolute_capture_timestamp_ms(estimated_capture_time_ns /
|
audio_frame->set_absolute_capture_timestamp_ms(estimated_capture_time_ns /
|
||||||
1000000);
|
1000000);
|
||||||
|
// Typing detection (utilizes the APM/VAD decision). We let the VAD determine
|
||||||
|
// if we're using this feature or not.
|
||||||
|
// TODO(solenberg): GetConfig() takes a lock. Work around that.
|
||||||
|
bool typing_detected = false;
|
||||||
|
if (audio_processing_ &&
|
||||||
|
audio_processing_->GetConfig().voice_detection.enabled) {
|
||||||
|
if (audio_frame->vad_activity_ != AudioFrame::kVadUnknown) {
|
||||||
|
bool vad_active = audio_frame->vad_activity_ == AudioFrame::kVadActive;
|
||||||
|
typing_detected = typing_detection_.Process(key_pressed, vad_active);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy frame and push to each sending stream. The copy is required since an
|
||||||
|
// encoding task will be posted internally to each stream.
|
||||||
|
{
|
||||||
|
MutexLock lock(&capture_lock_);
|
||||||
|
typing_noise_detected_ = typing_detected;
|
||||||
|
}
|
||||||
|
|
||||||
RTC_DCHECK_GT(audio_frame->samples_per_channel_, 0);
|
RTC_DCHECK_GT(audio_frame->samples_per_channel_, 0);
|
||||||
if (async_audio_processing_)
|
if (async_audio_processing_)
|
||||||
@ -272,4 +290,8 @@ void AudioTransportImpl::SetStereoChannelSwapping(bool enable) {
|
|||||||
swap_stereo_channels_ = enable;
|
swap_stereo_channels_ = enable;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool AudioTransportImpl::typing_noise_detected() const {
|
||||||
|
MutexLock lock(&capture_lock_);
|
||||||
|
return typing_noise_detected_;
|
||||||
|
}
|
||||||
} // namespace webrtc
|
} // namespace webrtc
|
||||||
|
@ -86,9 +86,7 @@ class AudioTransportImpl : public AudioTransport {
|
|||||||
int send_sample_rate_hz,
|
int send_sample_rate_hz,
|
||||||
size_t send_num_channels);
|
size_t send_num_channels);
|
||||||
void SetStereoChannelSwapping(bool enable);
|
void SetStereoChannelSwapping(bool enable);
|
||||||
// Deprecated.
|
bool typing_noise_detected() const;
|
||||||
// TODO(bugs.webrtc.org/11226): Remove.
|
|
||||||
bool typing_noise_detected() const { return false; }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void SendProcessedData(std::unique_ptr<AudioFrame> audio_frame);
|
void SendProcessedData(std::unique_ptr<AudioFrame> audio_frame);
|
||||||
@ -105,6 +103,7 @@ class AudioTransportImpl : public AudioTransport {
|
|||||||
std::vector<AudioSender*> audio_senders_ RTC_GUARDED_BY(capture_lock_);
|
std::vector<AudioSender*> audio_senders_ RTC_GUARDED_BY(capture_lock_);
|
||||||
int send_sample_rate_hz_ RTC_GUARDED_BY(capture_lock_) = 8000;
|
int send_sample_rate_hz_ RTC_GUARDED_BY(capture_lock_) = 8000;
|
||||||
size_t send_num_channels_ RTC_GUARDED_BY(capture_lock_) = 1;
|
size_t send_num_channels_ RTC_GUARDED_BY(capture_lock_) = 1;
|
||||||
|
bool typing_noise_detected_ RTC_GUARDED_BY(capture_lock_) = false;
|
||||||
bool swap_stereo_channels_ RTC_GUARDED_BY(capture_lock_) = false;
|
bool swap_stereo_channels_ RTC_GUARDED_BY(capture_lock_) = false;
|
||||||
PushResampler<int16_t> capture_resampler_;
|
PushResampler<int16_t> capture_resampler_;
|
||||||
TypingDetection typing_detection_;
|
TypingDetection typing_detection_;
|
||||||
|
@ -609,7 +609,9 @@ bool WebRtcVoiceEngine::ApplyOptions(const AudioOptions& options_in) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (options.typing_detection) {
|
if (options.typing_detection) {
|
||||||
RTC_LOG(LS_WARNING) << "Typing detection is requested, but unsupported.";
|
RTC_LOG(LS_INFO) << "Typing detection is enabled? "
|
||||||
|
<< *options.typing_detection;
|
||||||
|
apm_config.voice_detection.enabled = *options.typing_detection;
|
||||||
}
|
}
|
||||||
|
|
||||||
ap->ApplyConfig(apm_config);
|
ap->ApplyConfig(apm_config);
|
||||||
|
@ -221,6 +221,11 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam<bool> {
|
|||||||
// Default Options.
|
// Default Options.
|
||||||
VerifyEchoCancellationSettings(/*enabled=*/true);
|
VerifyEchoCancellationSettings(/*enabled=*/true);
|
||||||
EXPECT_TRUE(IsHighPassFilterEnabled());
|
EXPECT_TRUE(IsHighPassFilterEnabled());
|
||||||
|
#if defined(WEBRTC_ANDROID)
|
||||||
|
EXPECT_FALSE(IsTypingDetectionEnabled());
|
||||||
|
#else
|
||||||
|
EXPECT_TRUE(IsTypingDetectionEnabled());
|
||||||
|
#endif
|
||||||
EXPECT_TRUE(apm_config_.noise_suppression.enabled);
|
EXPECT_TRUE(apm_config_.noise_suppression.enabled);
|
||||||
EXPECT_EQ(apm_config_.noise_suppression.level, kDefaultNsLevel);
|
EXPECT_EQ(apm_config_.noise_suppression.level, kDefaultNsLevel);
|
||||||
VerifyGainControlEnabledCorrectly();
|
VerifyGainControlEnabledCorrectly();
|
||||||
@ -788,6 +793,10 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam<bool> {
|
|||||||
return apm_config_.high_pass_filter.enabled;
|
return apm_config_.high_pass_filter.enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool IsTypingDetectionEnabled() {
|
||||||
|
return apm_config_.voice_detection.enabled;
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
const bool use_null_apm_;
|
const bool use_null_apm_;
|
||||||
std::unique_ptr<webrtc::TaskQueueFactory> task_queue_factory_;
|
std::unique_ptr<webrtc::TaskQueueFactory> task_queue_factory_;
|
||||||
@ -2980,10 +2989,40 @@ TEST_P(WebRtcVoiceEngineTestFake, SetAudioOptions) {
|
|||||||
if (!use_null_apm_) {
|
if (!use_null_apm_) {
|
||||||
VerifyEchoCancellationSettings(/*enabled=*/true);
|
VerifyEchoCancellationSettings(/*enabled=*/true);
|
||||||
EXPECT_TRUE(IsHighPassFilterEnabled());
|
EXPECT_TRUE(IsHighPassFilterEnabled());
|
||||||
|
#if defined(WEBRTC_ANDROID)
|
||||||
|
EXPECT_FALSE(IsTypingDetectionEnabled());
|
||||||
|
#else
|
||||||
|
EXPECT_TRUE(IsTypingDetectionEnabled());
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
EXPECT_EQ(200u, GetRecvStreamConfig(kSsrcY).jitter_buffer_max_packets);
|
EXPECT_EQ(200u, GetRecvStreamConfig(kSsrcY).jitter_buffer_max_packets);
|
||||||
EXPECT_FALSE(GetRecvStreamConfig(kSsrcY).jitter_buffer_fast_accelerate);
|
EXPECT_FALSE(GetRecvStreamConfig(kSsrcY).jitter_buffer_fast_accelerate);
|
||||||
|
|
||||||
|
// Turn typing detection off.
|
||||||
|
send_parameters_.options.typing_detection = false;
|
||||||
|
SetSendParameters(send_parameters_);
|
||||||
|
if (!use_null_apm_) {
|
||||||
|
EXPECT_FALSE(IsTypingDetectionEnabled());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Leave typing detection unchanged, but non-default.
|
||||||
|
send_parameters_.options.typing_detection = absl::nullopt;
|
||||||
|
SetSendParameters(send_parameters_);
|
||||||
|
if (!use_null_apm_) {
|
||||||
|
EXPECT_FALSE(IsTypingDetectionEnabled());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Turn typing detection on.
|
||||||
|
send_parameters_.options.typing_detection = true;
|
||||||
|
SetSendParameters(send_parameters_);
|
||||||
|
if (!use_null_apm_) {
|
||||||
|
#if defined(WEBRTC_ANDROID)
|
||||||
|
EXPECT_FALSE(IsTypingDetectionEnabled());
|
||||||
|
#else
|
||||||
|
EXPECT_TRUE(IsTypingDetectionEnabled());
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
// Turn echo cancellation off
|
// Turn echo cancellation off
|
||||||
send_parameters_.options.echo_cancellation = false;
|
send_parameters_.options.echo_cancellation = false;
|
||||||
SetSendParameters(send_parameters_);
|
SetSendParameters(send_parameters_);
|
||||||
|
@ -168,6 +168,7 @@ rtc_library("audio_processing") {
|
|||||||
":high_pass_filter",
|
":high_pass_filter",
|
||||||
":optionally_built_submodule_creators",
|
":optionally_built_submodule_creators",
|
||||||
":rms_level",
|
":rms_level",
|
||||||
|
":voice_detection",
|
||||||
"../../api:array_view",
|
"../../api:array_view",
|
||||||
"../../api:function_view",
|
"../../api:function_view",
|
||||||
"../../api/audio:aec3_config",
|
"../../api/audio:aec3_config",
|
||||||
@ -217,6 +218,20 @@ rtc_library("audio_processing") {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
rtc_library("voice_detection") {
|
||||||
|
sources = [
|
||||||
|
"voice_detection.cc",
|
||||||
|
"voice_detection.h",
|
||||||
|
]
|
||||||
|
deps = [
|
||||||
|
":api",
|
||||||
|
":audio_buffer",
|
||||||
|
"../../api/audio:audio_frame_api",
|
||||||
|
"../../common_audio:common_audio_c",
|
||||||
|
"../../rtc_base:checks",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
rtc_library("residual_echo_detector") {
|
rtc_library("residual_echo_detector") {
|
||||||
poisonous = [ "default_echo_detector" ]
|
poisonous = [ "default_echo_detector" ]
|
||||||
configs += [ ":apm_debug_dump" ]
|
configs += [ ":apm_debug_dump" ]
|
||||||
@ -364,6 +379,7 @@ if (rtc_include_tests) {
|
|||||||
":gain_controller2",
|
":gain_controller2",
|
||||||
":high_pass_filter",
|
":high_pass_filter",
|
||||||
":mocks",
|
":mocks",
|
||||||
|
":voice_detection",
|
||||||
"../../api:array_view",
|
"../../api:array_view",
|
||||||
"../../api:scoped_refptr",
|
"../../api:scoped_refptr",
|
||||||
"../../api/audio:aec3_config",
|
"../../api/audio:aec3_config",
|
||||||
@ -458,6 +474,7 @@ if (rtc_include_tests) {
|
|||||||
"test/echo_canceller_test_tools_unittest.cc",
|
"test/echo_canceller_test_tools_unittest.cc",
|
||||||
"test/echo_control_mock.h",
|
"test/echo_control_mock.h",
|
||||||
"test/test_utils.h",
|
"test/test_utils.h",
|
||||||
|
"voice_detection_unittest.cc",
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -141,6 +141,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
|
|||||||
bool gain_controller2_enabled,
|
bool gain_controller2_enabled,
|
||||||
bool gain_adjustment_enabled,
|
bool gain_adjustment_enabled,
|
||||||
bool echo_controller_enabled,
|
bool echo_controller_enabled,
|
||||||
|
bool voice_detector_enabled,
|
||||||
bool transient_suppressor_enabled) {
|
bool transient_suppressor_enabled) {
|
||||||
bool changed = false;
|
bool changed = false;
|
||||||
changed |= (high_pass_filter_enabled != high_pass_filter_enabled_);
|
changed |= (high_pass_filter_enabled != high_pass_filter_enabled_);
|
||||||
@ -152,6 +153,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
|
|||||||
changed |= (gain_controller2_enabled != gain_controller2_enabled_);
|
changed |= (gain_controller2_enabled != gain_controller2_enabled_);
|
||||||
changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
|
changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
|
||||||
changed |= (echo_controller_enabled != echo_controller_enabled_);
|
changed |= (echo_controller_enabled != echo_controller_enabled_);
|
||||||
|
changed |= (voice_detector_enabled != voice_detector_enabled_);
|
||||||
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
|
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
|
||||||
if (changed) {
|
if (changed) {
|
||||||
high_pass_filter_enabled_ = high_pass_filter_enabled;
|
high_pass_filter_enabled_ = high_pass_filter_enabled;
|
||||||
@ -161,6 +163,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
|
|||||||
gain_controller2_enabled_ = gain_controller2_enabled;
|
gain_controller2_enabled_ = gain_controller2_enabled;
|
||||||
gain_adjustment_enabled_ = gain_adjustment_enabled;
|
gain_adjustment_enabled_ = gain_adjustment_enabled;
|
||||||
echo_controller_enabled_ = echo_controller_enabled;
|
echo_controller_enabled_ = echo_controller_enabled;
|
||||||
|
voice_detector_enabled_ = voice_detector_enabled;
|
||||||
transient_suppressor_enabled_ = transient_suppressor_enabled;
|
transient_suppressor_enabled_ = transient_suppressor_enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -171,7 +174,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
|
|||||||
|
|
||||||
bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandSubModulesActive()
|
bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandSubModulesActive()
|
||||||
const {
|
const {
|
||||||
return CaptureMultiBandProcessingPresent();
|
return CaptureMultiBandProcessingPresent() || voice_detector_enabled_;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandProcessingPresent()
|
bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandProcessingPresent()
|
||||||
@ -368,6 +371,7 @@ void AudioProcessingImpl::InitializeLocked() {
|
|||||||
InitializeGainController1();
|
InitializeGainController1();
|
||||||
InitializeTransientSuppressor();
|
InitializeTransientSuppressor();
|
||||||
InitializeHighPassFilter(true);
|
InitializeHighPassFilter(true);
|
||||||
|
InitializeVoiceDetector();
|
||||||
InitializeResidualEchoDetector();
|
InitializeResidualEchoDetector();
|
||||||
InitializeEchoController();
|
InitializeEchoController();
|
||||||
InitializeGainController2(/*config_has_changed=*/true);
|
InitializeGainController2(/*config_has_changed=*/true);
|
||||||
@ -502,6 +506,9 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
|
|||||||
const bool agc2_config_changed =
|
const bool agc2_config_changed =
|
||||||
config_.gain_controller2 != config.gain_controller2;
|
config_.gain_controller2 != config.gain_controller2;
|
||||||
|
|
||||||
|
const bool voice_detection_config_changed =
|
||||||
|
config_.voice_detection.enabled != config.voice_detection.enabled;
|
||||||
|
|
||||||
const bool ns_config_changed =
|
const bool ns_config_changed =
|
||||||
config_.noise_suppression.enabled != config.noise_suppression.enabled ||
|
config_.noise_suppression.enabled != config.noise_suppression.enabled ||
|
||||||
config_.noise_suppression.level != config.noise_suppression.level;
|
config_.noise_suppression.level != config.noise_suppression.level;
|
||||||
@ -550,6 +557,10 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
|
|||||||
InitializeCaptureLevelsAdjuster();
|
InitializeCaptureLevelsAdjuster();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (voice_detection_config_changed) {
|
||||||
|
InitializeVoiceDetector();
|
||||||
|
}
|
||||||
|
|
||||||
// Reinitialization must happen after all submodule configuration to avoid
|
// Reinitialization must happen after all submodule configuration to avoid
|
||||||
// additional reinitializations on the next capture / render processing call.
|
// additional reinitializations on the next capture / render processing call.
|
||||||
if (pipeline_config_changed) {
|
if (pipeline_config_changed) {
|
||||||
@ -1204,6 +1215,13 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (config_.voice_detection.enabled) {
|
||||||
|
capture_.stats.voice_detected =
|
||||||
|
submodules_.voice_detector->ProcessCaptureAudio(capture_buffer);
|
||||||
|
} else {
|
||||||
|
capture_.stats.voice_detected = absl::nullopt;
|
||||||
|
}
|
||||||
|
|
||||||
if (submodules_.agc_manager) {
|
if (submodules_.agc_manager) {
|
||||||
submodules_.agc_manager->Process(capture_buffer);
|
submodules_.agc_manager->Process(capture_buffer);
|
||||||
|
|
||||||
@ -1664,7 +1682,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() {
|
|||||||
!!submodules_.gain_controller2,
|
!!submodules_.gain_controller2,
|
||||||
config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
|
config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
|
||||||
capture_nonlocked_.echo_controller_enabled,
|
capture_nonlocked_.echo_controller_enabled,
|
||||||
!!submodules_.transient_suppressor);
|
config_.voice_detection.enabled, !!submodules_.transient_suppressor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void AudioProcessingImpl::InitializeTransientSuppressor() {
|
void AudioProcessingImpl::InitializeTransientSuppressor() {
|
||||||
@ -1714,6 +1732,14 @@ void AudioProcessingImpl::InitializeHighPassFilter(bool forced_reset) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void AudioProcessingImpl::InitializeVoiceDetector() {
|
||||||
|
if (config_.voice_detection.enabled) {
|
||||||
|
submodules_.voice_detector = std::make_unique<VoiceDetection>(
|
||||||
|
proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood);
|
||||||
|
} else {
|
||||||
|
submodules_.voice_detector.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
void AudioProcessingImpl::InitializeEchoController() {
|
void AudioProcessingImpl::InitializeEchoController() {
|
||||||
bool use_echo_controller =
|
bool use_echo_controller =
|
||||||
echo_control_factory_ ||
|
echo_control_factory_ ||
|
||||||
|
@ -39,6 +39,7 @@
|
|||||||
#include "modules/audio_processing/render_queue_item_verifier.h"
|
#include "modules/audio_processing/render_queue_item_verifier.h"
|
||||||
#include "modules/audio_processing/rms_level.h"
|
#include "modules/audio_processing/rms_level.h"
|
||||||
#include "modules/audio_processing/transient/transient_suppressor.h"
|
#include "modules/audio_processing/transient/transient_suppressor.h"
|
||||||
|
#include "modules/audio_processing/voice_detection.h"
|
||||||
#include "rtc_base/gtest_prod_util.h"
|
#include "rtc_base/gtest_prod_util.h"
|
||||||
#include "rtc_base/ignore_wundef.h"
|
#include "rtc_base/ignore_wundef.h"
|
||||||
#include "rtc_base/swap_queue.h"
|
#include "rtc_base/swap_queue.h"
|
||||||
@ -207,6 +208,7 @@ class AudioProcessingImpl : public AudioProcessing {
|
|||||||
bool gain_controller2_enabled,
|
bool gain_controller2_enabled,
|
||||||
bool gain_adjustment_enabled,
|
bool gain_adjustment_enabled,
|
||||||
bool echo_controller_enabled,
|
bool echo_controller_enabled,
|
||||||
|
bool voice_detector_enabled,
|
||||||
bool transient_suppressor_enabled);
|
bool transient_suppressor_enabled);
|
||||||
bool CaptureMultiBandSubModulesActive() const;
|
bool CaptureMultiBandSubModulesActive() const;
|
||||||
bool CaptureMultiBandProcessingPresent() const;
|
bool CaptureMultiBandProcessingPresent() const;
|
||||||
@ -229,6 +231,7 @@ class AudioProcessingImpl : public AudioProcessing {
|
|||||||
bool gain_controller2_enabled_ = false;
|
bool gain_controller2_enabled_ = false;
|
||||||
bool gain_adjustment_enabled_ = false;
|
bool gain_adjustment_enabled_ = false;
|
||||||
bool echo_controller_enabled_ = false;
|
bool echo_controller_enabled_ = false;
|
||||||
|
bool voice_detector_enabled_ = false;
|
||||||
bool transient_suppressor_enabled_ = false;
|
bool transient_suppressor_enabled_ = false;
|
||||||
bool first_update_ = true;
|
bool first_update_ = true;
|
||||||
};
|
};
|
||||||
@ -264,6 +267,7 @@ class AudioProcessingImpl : public AudioProcessing {
|
|||||||
// already acquired.
|
// already acquired.
|
||||||
void InitializeHighPassFilter(bool forced_reset)
|
void InitializeHighPassFilter(bool forced_reset)
|
||||||
RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
|
RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
|
||||||
|
void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
|
||||||
void InitializeGainController1() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
|
void InitializeGainController1() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
|
||||||
void InitializeTransientSuppressor()
|
void InitializeTransientSuppressor()
|
||||||
RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
|
RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
|
||||||
@ -396,6 +400,7 @@ class AudioProcessingImpl : public AudioProcessing {
|
|||||||
std::unique_ptr<EchoControlMobileImpl> echo_control_mobile;
|
std::unique_ptr<EchoControlMobileImpl> echo_control_mobile;
|
||||||
std::unique_ptr<NoiseSuppressor> noise_suppressor;
|
std::unique_ptr<NoiseSuppressor> noise_suppressor;
|
||||||
std::unique_ptr<TransientSuppressor> transient_suppressor;
|
std::unique_ptr<TransientSuppressor> transient_suppressor;
|
||||||
|
std::unique_ptr<VoiceDetection> voice_detector;
|
||||||
std::unique_ptr<CaptureLevelsAdjuster> capture_levels_adjuster;
|
std::unique_ptr<CaptureLevelsAdjuster> capture_levels_adjuster;
|
||||||
} submodules_;
|
} submodules_;
|
||||||
|
|
||||||
|
@ -483,6 +483,7 @@ AudioProcessing::Config GetApmTestConfig(AecType aec_type) {
|
|||||||
apm_config.gain_controller1.mode =
|
apm_config.gain_controller1.mode =
|
||||||
AudioProcessing::Config::GainController1::kAdaptiveDigital;
|
AudioProcessing::Config::GainController1::kAdaptiveDigital;
|
||||||
apm_config.noise_suppression.enabled = true;
|
apm_config.noise_suppression.enabled = true;
|
||||||
|
apm_config.voice_detection.enabled = true;
|
||||||
return apm_config;
|
return apm_config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -441,6 +441,7 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
|
|||||||
apm_config.gain_controller1.enabled = true;
|
apm_config.gain_controller1.enabled = true;
|
||||||
apm_config.gain_controller1.mode =
|
apm_config.gain_controller1.mode =
|
||||||
AudioProcessing::Config::GainController1::kAdaptiveDigital;
|
AudioProcessing::Config::GainController1::kAdaptiveDigital;
|
||||||
|
apm_config.voice_detection.enabled = true;
|
||||||
apm->ApplyConfig(apm_config);
|
apm->ApplyConfig(apm_config);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -452,6 +453,7 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
|
|||||||
apm_config.noise_suppression.enabled = true;
|
apm_config.noise_suppression.enabled = true;
|
||||||
apm_config.gain_controller1.mode =
|
apm_config.gain_controller1.mode =
|
||||||
AudioProcessing::Config::GainController1::kAdaptiveDigital;
|
AudioProcessing::Config::GainController1::kAdaptiveDigital;
|
||||||
|
apm_config.voice_detection.enabled = true;
|
||||||
apm->ApplyConfig(apm_config);
|
apm->ApplyConfig(apm_config);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -462,6 +464,7 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
|
|||||||
apm_config.echo_canceller.enabled = false;
|
apm_config.echo_canceller.enabled = false;
|
||||||
apm_config.gain_controller1.enabled = false;
|
apm_config.gain_controller1.enabled = false;
|
||||||
apm_config.noise_suppression.enabled = false;
|
apm_config.noise_suppression.enabled = false;
|
||||||
|
apm_config.voice_detection.enabled = false;
|
||||||
apm->ApplyConfig(apm_config);
|
apm->ApplyConfig(apm_config);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -190,6 +190,7 @@ void EnableAllAPComponents(AudioProcessing* ap) {
|
|||||||
apm_config.noise_suppression.enabled = true;
|
apm_config.noise_suppression.enabled = true;
|
||||||
|
|
||||||
apm_config.high_pass_filter.enabled = true;
|
apm_config.high_pass_filter.enabled = true;
|
||||||
|
apm_config.voice_detection.enabled = true;
|
||||||
apm_config.pipeline.maximum_internal_processing_rate = 48000;
|
apm_config.pipeline.maximum_internal_processing_rate = 48000;
|
||||||
ap->ApplyConfig(apm_config);
|
ap->ApplyConfig(apm_config);
|
||||||
}
|
}
|
||||||
@ -1225,6 +1226,7 @@ TEST_F(ApmTest, AllProcessingDisabledByDefault) {
|
|||||||
EXPECT_FALSE(config.high_pass_filter.enabled);
|
EXPECT_FALSE(config.high_pass_filter.enabled);
|
||||||
EXPECT_FALSE(config.gain_controller1.enabled);
|
EXPECT_FALSE(config.gain_controller1.enabled);
|
||||||
EXPECT_FALSE(config.noise_suppression.enabled);
|
EXPECT_FALSE(config.noise_suppression.enabled);
|
||||||
|
EXPECT_FALSE(config.voice_detection.enabled);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
|
TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
|
||||||
@ -1365,6 +1367,48 @@ TEST_F(ApmTest, SplittingFilter) {
|
|||||||
EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
|
EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
|
||||||
apm_->ApplyConfig(apm_config);
|
apm_->ApplyConfig(apm_config);
|
||||||
|
|
||||||
|
// 3. Only GetStatistics-reporting VAD is enabled...
|
||||||
|
SetFrameTo(&frame_, 1000);
|
||||||
|
frame_copy.CopyFrom(frame_);
|
||||||
|
apm_config.voice_detection.enabled = true;
|
||||||
|
apm_->ApplyConfig(apm_config);
|
||||||
|
EXPECT_EQ(apm_->kNoError,
|
||||||
|
apm_->ProcessStream(
|
||||||
|
frame_.data.data(),
|
||||||
|
StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
|
||||||
|
StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
|
||||||
|
frame_.data.data()));
|
||||||
|
EXPECT_EQ(apm_->kNoError,
|
||||||
|
apm_->ProcessStream(
|
||||||
|
frame_.data.data(),
|
||||||
|
StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
|
||||||
|
StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
|
||||||
|
frame_.data.data()));
|
||||||
|
EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
|
||||||
|
apm_config.voice_detection.enabled = false;
|
||||||
|
apm_->ApplyConfig(apm_config);
|
||||||
|
|
||||||
|
// 4. The VAD is enabled...
|
||||||
|
SetFrameTo(&frame_, 1000);
|
||||||
|
frame_copy.CopyFrom(frame_);
|
||||||
|
apm_config.voice_detection.enabled = true;
|
||||||
|
apm_->ApplyConfig(apm_config);
|
||||||
|
EXPECT_EQ(apm_->kNoError,
|
||||||
|
apm_->ProcessStream(
|
||||||
|
frame_.data.data(),
|
||||||
|
StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
|
||||||
|
StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
|
||||||
|
frame_.data.data()));
|
||||||
|
EXPECT_EQ(apm_->kNoError,
|
||||||
|
apm_->ProcessStream(
|
||||||
|
frame_.data.data(),
|
||||||
|
StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
|
||||||
|
StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
|
||||||
|
frame_.data.data()));
|
||||||
|
EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
|
||||||
|
apm_config.voice_detection.enabled = false;
|
||||||
|
apm_->ApplyConfig(apm_config);
|
||||||
|
|
||||||
// Check the test is valid. We should have distortion from the filter
|
// Check the test is valid. We should have distortion from the filter
|
||||||
// when AEC is enabled (which won't affect the audio).
|
// when AEC is enabled (which won't affect the audio).
|
||||||
apm_config.echo_canceller.enabled = true;
|
apm_config.echo_canceller.enabled = true;
|
||||||
@ -1692,6 +1736,7 @@ TEST_F(ApmTest, Process) {
|
|||||||
static_cast<size_t>(test->num_reverse_channels()), true);
|
static_cast<size_t>(test->num_reverse_channels()), true);
|
||||||
|
|
||||||
int frame_count = 0;
|
int frame_count = 0;
|
||||||
|
int has_voice_count = 0;
|
||||||
int analog_level = 127;
|
int analog_level = 127;
|
||||||
int analog_level_average = 0;
|
int analog_level_average = 0;
|
||||||
int max_output_average = 0;
|
int max_output_average = 0;
|
||||||
@ -1727,6 +1772,8 @@ TEST_F(ApmTest, Process) {
|
|||||||
analog_level = apm_->recommended_stream_analog_level();
|
analog_level = apm_->recommended_stream_analog_level();
|
||||||
analog_level_average += analog_level;
|
analog_level_average += analog_level;
|
||||||
AudioProcessingStats stats = apm_->GetStatistics();
|
AudioProcessingStats stats = apm_->GetStatistics();
|
||||||
|
EXPECT_TRUE(stats.voice_detected);
|
||||||
|
has_voice_count += *stats.voice_detected ? 1 : 0;
|
||||||
|
|
||||||
size_t frame_size = frame_.samples_per_channel * frame_.num_channels;
|
size_t frame_size = frame_.samples_per_channel * frame_.num_channels;
|
||||||
size_t write_count =
|
size_t write_count =
|
||||||
@ -1782,23 +1829,33 @@ TEST_F(ApmTest, Process) {
|
|||||||
|
|
||||||
if (!absl::GetFlag(FLAGS_write_apm_ref_data)) {
|
if (!absl::GetFlag(FLAGS_write_apm_ref_data)) {
|
||||||
const int kIntNear = 1;
|
const int kIntNear = 1;
|
||||||
// All numbers being consistently higher on N7 compare to the reference
|
// When running the test on a N7 we get a {2, 6} difference of
|
||||||
// data.
|
// `has_voice_count` and `max_output_average` is up to 18 higher.
|
||||||
|
// All numbers being consistently higher on N7 compare to ref_data.
|
||||||
// TODO(bjornv): If we start getting more of these offsets on Android we
|
// TODO(bjornv): If we start getting more of these offsets on Android we
|
||||||
// should consider a different approach. Either using one slack for all,
|
// should consider a different approach. Either using one slack for all,
|
||||||
// or generate a separate android reference.
|
// or generate a separate android reference.
|
||||||
#if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS)
|
#if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS)
|
||||||
|
const int kHasVoiceCountOffset = 3;
|
||||||
|
const int kHasVoiceCountNear = 8;
|
||||||
const int kMaxOutputAverageOffset = 9;
|
const int kMaxOutputAverageOffset = 9;
|
||||||
const int kMaxOutputAverageNear = 26;
|
const int kMaxOutputAverageNear = 26;
|
||||||
#else
|
#else
|
||||||
|
const int kHasVoiceCountOffset = 0;
|
||||||
|
const int kHasVoiceCountNear = kIntNear;
|
||||||
const int kMaxOutputAverageOffset = 0;
|
const int kMaxOutputAverageOffset = 0;
|
||||||
const int kMaxOutputAverageNear = kIntNear;
|
const int kMaxOutputAverageNear = kIntNear;
|
||||||
#endif
|
#endif
|
||||||
|
EXPECT_NEAR(test->has_voice_count(),
|
||||||
|
has_voice_count - kHasVoiceCountOffset, kHasVoiceCountNear);
|
||||||
|
|
||||||
EXPECT_NEAR(test->analog_level_average(), analog_level_average, kIntNear);
|
EXPECT_NEAR(test->analog_level_average(), analog_level_average, kIntNear);
|
||||||
EXPECT_NEAR(test->max_output_average(),
|
EXPECT_NEAR(test->max_output_average(),
|
||||||
max_output_average - kMaxOutputAverageOffset,
|
max_output_average - kMaxOutputAverageOffset,
|
||||||
kMaxOutputAverageNear);
|
kMaxOutputAverageNear);
|
||||||
} else {
|
} else {
|
||||||
|
test->set_has_voice_count(has_voice_count);
|
||||||
|
|
||||||
test->set_analog_level_average(analog_level_average);
|
test->set_analog_level_average(analog_level_average);
|
||||||
test->set_max_output_average(max_output_average);
|
test->set_max_output_average(max_output_average);
|
||||||
}
|
}
|
||||||
@ -2628,6 +2685,7 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(bool mobile_aec) {
|
|||||||
apm_config.echo_canceller.enabled = true;
|
apm_config.echo_canceller.enabled = true;
|
||||||
apm_config.echo_canceller.mobile_mode = mobile_aec;
|
apm_config.echo_canceller.mobile_mode = mobile_aec;
|
||||||
apm_config.noise_suppression.enabled = false;
|
apm_config.noise_suppression.enabled = false;
|
||||||
|
apm_config.voice_detection.enabled = false;
|
||||||
apm->ApplyConfig(apm_config);
|
apm->ApplyConfig(apm_config);
|
||||||
return apm;
|
return apm;
|
||||||
}
|
}
|
||||||
@ -2736,9 +2794,10 @@ TEST(MAYBE_ApmStatistics, AECMEnabledTest) {
|
|||||||
EXPECT_FALSE(stats.echo_return_loss_enhancement.has_value());
|
EXPECT_FALSE(stats.echo_return_loss_enhancement.has_value());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ApmStatistics, DoNotReportVoiceDetectedStat) {
|
TEST(ApmStatistics, ReportHasVoice) {
|
||||||
ProcessingConfig processing_config = {
|
ProcessingConfig processing_config = {
|
||||||
{{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
|
{{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
|
||||||
|
AudioProcessing::Config config;
|
||||||
|
|
||||||
// Set up an audioframe.
|
// Set up an audioframe.
|
||||||
Int16FrameData frame;
|
Int16FrameData frame;
|
||||||
@ -2755,14 +2814,37 @@ TEST(ApmStatistics, DoNotReportVoiceDetectedStat) {
|
|||||||
AudioProcessingBuilderForTesting().Create();
|
AudioProcessingBuilderForTesting().Create();
|
||||||
apm->Initialize(processing_config);
|
apm->Initialize(processing_config);
|
||||||
|
|
||||||
// No metric should be reported.
|
// If not enabled, no metric should be reported.
|
||||||
EXPECT_EQ(
|
EXPECT_EQ(
|
||||||
apm->ProcessStream(frame.data.data(),
|
apm->ProcessStream(frame.data.data(),
|
||||||
StreamConfig(frame.sample_rate_hz, frame.num_channels),
|
StreamConfig(frame.sample_rate_hz, frame.num_channels),
|
||||||
StreamConfig(frame.sample_rate_hz, frame.num_channels),
|
StreamConfig(frame.sample_rate_hz, frame.num_channels),
|
||||||
frame.data.data()),
|
frame.data.data()),
|
||||||
0);
|
0);
|
||||||
EXPECT_FALSE(apm->GetStatistics().voice_detected.has_value());
|
EXPECT_FALSE(apm->GetStatistics().voice_detected);
|
||||||
|
|
||||||
|
// If enabled, metrics should be reported.
|
||||||
|
config.voice_detection.enabled = true;
|
||||||
|
apm->ApplyConfig(config);
|
||||||
|
EXPECT_EQ(
|
||||||
|
apm->ProcessStream(frame.data.data(),
|
||||||
|
StreamConfig(frame.sample_rate_hz, frame.num_channels),
|
||||||
|
StreamConfig(frame.sample_rate_hz, frame.num_channels),
|
||||||
|
frame.data.data()),
|
||||||
|
0);
|
||||||
|
auto stats = apm->GetStatistics();
|
||||||
|
EXPECT_TRUE(stats.voice_detected);
|
||||||
|
|
||||||
|
// If re-disabled, the value is again not reported.
|
||||||
|
config.voice_detection.enabled = false;
|
||||||
|
apm->ApplyConfig(config);
|
||||||
|
EXPECT_EQ(
|
||||||
|
apm->ProcessStream(frame.data.data(),
|
||||||
|
StreamConfig(frame.sample_rate_hz, frame.num_channels),
|
||||||
|
StreamConfig(frame.sample_rate_hz, frame.num_channels),
|
||||||
|
frame.data.data()),
|
||||||
|
0);
|
||||||
|
EXPECT_FALSE(apm->GetStatistics().voice_detected);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ApmStatistics, GetStatisticsReportsNoEchoDetectorStatsWhenDisabled) {
|
TEST(ApmStatistics, GetStatisticsReportsNoEchoDetectorStatsWhenDisabled) {
|
||||||
|
@ -145,6 +145,7 @@ std::string AudioProcessing::Config::ToString() const {
|
|||||||
<< NoiseSuppressionLevelToString(noise_suppression.level)
|
<< NoiseSuppressionLevelToString(noise_suppression.level)
|
||||||
<< " }, transient_suppression: { enabled: "
|
<< " }, transient_suppression: { enabled: "
|
||||||
<< transient_suppression.enabled
|
<< transient_suppression.enabled
|
||||||
|
<< " }, voice_detection: { enabled: " << voice_detection.enabled
|
||||||
<< " }, gain_controller1: { enabled: " << gain_controller1.enabled
|
<< " }, gain_controller1: { enabled: " << gain_controller1.enabled
|
||||||
<< ", mode: " << GainController1ModeToString(gain_controller1.mode)
|
<< ", mode: " << GainController1ModeToString(gain_controller1.mode)
|
||||||
<< ", target_level_dbfs: " << gain_controller1.target_level_dbfs
|
<< ", target_level_dbfs: " << gain_controller1.target_level_dbfs
|
||||||
|
@ -113,6 +113,8 @@ static constexpr int kClippedLevelMin = 70;
|
|||||||
//
|
//
|
||||||
// config.high_pass_filter.enabled = true;
|
// config.high_pass_filter.enabled = true;
|
||||||
//
|
//
|
||||||
|
// config.voice_detection.enabled = true;
|
||||||
|
//
|
||||||
// apm->ApplyConfig(config)
|
// apm->ApplyConfig(config)
|
||||||
//
|
//
|
||||||
// apm->noise_reduction()->set_level(kHighSuppression);
|
// apm->noise_reduction()->set_level(kHighSuppression);
|
||||||
@ -230,6 +232,11 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface {
|
|||||||
bool enabled = false;
|
bool enabled = false;
|
||||||
} transient_suppression;
|
} transient_suppression;
|
||||||
|
|
||||||
|
// Enables reporting of `voice_detected` in webrtc::AudioProcessingStats.
|
||||||
|
struct VoiceDetection {
|
||||||
|
bool enabled = false;
|
||||||
|
} voice_detection;
|
||||||
|
|
||||||
// Enables automatic gain control (AGC) functionality.
|
// Enables automatic gain control (AGC) functionality.
|
||||||
// The automatic gain control (AGC) component brings the signal to an
|
// The automatic gain control (AGC) component brings the signal to an
|
||||||
// appropriate range. This is done by applying a digital gain directly and,
|
// appropriate range. This is done by applying a digital gain directly and,
|
||||||
|
@ -24,8 +24,6 @@ struct RTC_EXPORT AudioProcessingStats {
|
|||||||
AudioProcessingStats(const AudioProcessingStats& other);
|
AudioProcessingStats(const AudioProcessingStats& other);
|
||||||
~AudioProcessingStats();
|
~AudioProcessingStats();
|
||||||
|
|
||||||
// Deprecated.
|
|
||||||
// TODO(bugs.webrtc.org/11226): Remove.
|
|
||||||
// True if voice is detected in the last capture frame, after processing.
|
// True if voice is detected in the last capture frame, after processing.
|
||||||
// It is conservative in flagging audio as speech, with low likelihood of
|
// It is conservative in flagging audio as speech, with low likelihood of
|
||||||
// incorrectly flagging a frame as voice.
|
// incorrectly flagging a frame as voice.
|
||||||
|
@ -543,6 +543,10 @@ void AudioProcessingSimulator::ConfigureAudioProcessor() {
|
|||||||
apm_config.high_pass_filter.enabled = *settings_.use_hpf;
|
apm_config.high_pass_filter.enabled = *settings_.use_hpf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (settings_.use_vad) {
|
||||||
|
apm_config.voice_detection.enabled = *settings_.use_vad;
|
||||||
|
}
|
||||||
|
|
||||||
if (settings_.use_agc) {
|
if (settings_.use_agc) {
|
||||||
apm_config.gain_controller1.enabled = *settings_.use_agc;
|
apm_config.gain_controller1.enabled = *settings_.use_agc;
|
||||||
}
|
}
|
||||||
|
@ -105,6 +105,7 @@ struct SimulationSettings {
|
|||||||
absl::optional<bool> use_ns;
|
absl::optional<bool> use_ns;
|
||||||
absl::optional<int> use_ts;
|
absl::optional<int> use_ts;
|
||||||
absl::optional<bool> use_analog_agc;
|
absl::optional<bool> use_analog_agc;
|
||||||
|
absl::optional<bool> use_vad;
|
||||||
absl::optional<bool> use_all;
|
absl::optional<bool> use_all;
|
||||||
absl::optional<bool> analog_agc_disable_digital_adaptive;
|
absl::optional<bool> analog_agc_disable_digital_adaptive;
|
||||||
absl::optional<int> agc_mode;
|
absl::optional<int> agc_mode;
|
||||||
|
@ -117,6 +117,10 @@ ABSL_FLAG(int,
|
|||||||
analog_agc,
|
analog_agc,
|
||||||
kParameterNotSpecifiedValue,
|
kParameterNotSpecifiedValue,
|
||||||
"Activate (1) or deactivate (0) the analog AGC");
|
"Activate (1) or deactivate (0) the analog AGC");
|
||||||
|
ABSL_FLAG(int,
|
||||||
|
vad,
|
||||||
|
kParameterNotSpecifiedValue,
|
||||||
|
"Activate (1) or deactivate (0) the voice activity detector");
|
||||||
ABSL_FLAG(bool,
|
ABSL_FLAG(bool,
|
||||||
all_default,
|
all_default,
|
||||||
false,
|
false,
|
||||||
@ -361,6 +365,7 @@ void SetSettingIfFlagSet(int32_t flag, absl::optional<bool>* parameter) {
|
|||||||
SimulationSettings CreateSettings() {
|
SimulationSettings CreateSettings() {
|
||||||
SimulationSettings settings;
|
SimulationSettings settings;
|
||||||
if (absl::GetFlag(FLAGS_all_default)) {
|
if (absl::GetFlag(FLAGS_all_default)) {
|
||||||
|
settings.use_vad = true;
|
||||||
settings.use_ts = true;
|
settings.use_ts = true;
|
||||||
settings.use_analog_agc = true;
|
settings.use_analog_agc = true;
|
||||||
settings.use_ns = true;
|
settings.use_ns = true;
|
||||||
@ -412,6 +417,7 @@ SimulationSettings CreateSettings() {
|
|||||||
SetSettingIfSpecified(absl::GetFlag(FLAGS_ts), &settings.use_ts);
|
SetSettingIfSpecified(absl::GetFlag(FLAGS_ts), &settings.use_ts);
|
||||||
SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc),
|
SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc),
|
||||||
&settings.use_analog_agc);
|
&settings.use_analog_agc);
|
||||||
|
SetSettingIfFlagSet(absl::GetFlag(FLAGS_vad), &settings.use_vad);
|
||||||
SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc_disable_digital_adaptive),
|
SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc_disable_digital_adaptive),
|
||||||
&settings.analog_agc_disable_digital_adaptive);
|
&settings.analog_agc_disable_digital_adaptive);
|
||||||
SetSettingIfSpecified(absl::GetFlag(FLAGS_agc_mode), &settings.agc_mode);
|
SetSettingIfSpecified(absl::GetFlag(FLAGS_agc_mode), &settings.agc_mode);
|
||||||
|
92
modules/audio_processing/voice_detection.cc
Normal file
92
modules/audio_processing/voice_detection.cc
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "modules/audio_processing/voice_detection.h"
|
||||||
|
|
||||||
|
#include "common_audio/vad/include/webrtc_vad.h"
|
||||||
|
#include "modules/audio_processing/audio_buffer.h"
|
||||||
|
#include "rtc_base/checks.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
// Owns the `VadInst` handle of the C VAD API: the handle is created and
// initialized on construction and released on destruction.
class VoiceDetection::Vad {
 public:
  Vad() : state_(WebRtcVad_Create()) {
    // A handle is required by every later call, so a failed creation is
    // checked with RTC_CHECK rather than RTC_DCHECK.
    RTC_CHECK(state_);
    const int error = WebRtcVad_Init(state_);
    RTC_DCHECK_EQ(0, error);
  }
  ~Vad() { WebRtcVad_Free(state_); }

  // Not copyable: a copy would free the same handle a second time.
  Vad(const Vad&) = delete;
  Vad& operator=(const Vad&) = delete;

  // Returns the wrapped handle; ownership stays with this object.
  VadInst* state() { return state_; }

 private:
  VadInst* const state_;
};
|
||||||
|
|
||||||
|
// Creates a detector for audio sampled at `sample_rate_hz`, working on frames
// of `sample_rate_hz / 100` samples, and configures the underlying VAD mode
// from `likelihood`.
VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood)
    : sample_rate_hz_(sample_rate_hz),
      frame_size_samples_(static_cast<size_t>(sample_rate_hz / 100)),
      likelihood_(likelihood),
      vad_(new Vad()) {
  // Likelihood -> VAD mode: very low -> 3, low -> 2, moderate -> 1, high -> 0.
  // An unknown value trips a debug check and otherwise falls back to mode 2.
  const auto to_vad_mode = [](Likelihood requested) -> int {
    switch (requested) {
      case VoiceDetection::kVeryLowLikelihood:
        return 3;
      case VoiceDetection::kLowLikelihood:
        return 2;
      case VoiceDetection::kModerateLikelihood:
        return 1;
      case VoiceDetection::kHighLikelihood:
        return 0;
      default:
        RTC_DCHECK_NOTREACHED();
        return 2;
    }
  };

  const int set_mode_result =
      WebRtcVad_set_mode(vad_->state(), to_vad_mode(likelihood));
  RTC_DCHECK_EQ(0, set_mode_result);
}
|
||||||
|
|
||||||
|
// Defined in this file, after the definition of `Vad`, so that the
// `std::unique_ptr<Vad>` member can be destroyed with a complete type.
VoiceDetection::~VoiceDetection() = default;
|
||||||
|
|
||||||
|
// Runs the VAD on the `kBand0To8kHz` band of `audio` and returns true if voice
// is detected in the frame. Single-channel audio is converted directly to
// int16; for multi-channel audio the channels are averaged sample by sample
// into one int16 signal first.
bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) {
  const size_t num_frames = audio->num_frames_per_band();
  RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength, num_frames);
  // The VAD call below reads `frame_size_samples_` samples while only
  // `num_frames` samples are written to the buffer, so the two must agree.
  // NOTE(review): assumes callers construct the detector with the per-band
  // sample rate of `audio` - confirm for any new call site.
  RTC_DCHECK_EQ(frame_size_samples_, num_frames);

  std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
  if (audio->num_channels() == 1) {
    FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz], num_frames,
                  mixed_low_pass_data.data());
  } else {
    const int num_channels = static_cast<int>(audio->num_channels());
    // Fetch the per-channel band pointers once instead of once per sample.
    const auto low_band_channels = audio->split_channels_const(kBand0To8kHz);
    for (size_t i = 0; i < num_frames; ++i) {
      // Accumulate in 32 bits so that summing int16 samples cannot overflow.
      int32_t value = FloatS16ToS16(low_band_channels[0][i]);
      for (int j = 1; j < num_channels; ++j) {
        value += FloatS16ToS16(low_band_channels[j][i]);
      }
      mixed_low_pass_data[i] = value / num_channels;
    }
  }

  const int vad_ret =
      WebRtcVad_Process(vad_->state(), sample_rate_hz_,
                        mixed_low_pass_data.data(), frame_size_samples_);
  RTC_DCHECK(vad_ret == 0 || vad_ret == 1);
  return vad_ret != 0;
}
|
||||||
|
} // namespace webrtc
|
59
modules/audio_processing/voice_detection.h
Normal file
59
modules/audio_processing/voice_detection.h
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
|
||||||
|
#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
#include "modules/audio_processing/include/audio_processing.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
|
||||||
|
class AudioBuffer;

// The voice activity detection (VAD) component analyzes the stream to
// determine if voice is present.
class VoiceDetection {
 public:
  // Specifies the likelihood that a frame will be declared to contain voice.
  // A higher value makes it more likely that speech will not be clipped, at
  // the expense of more noise being detected as voice.
  enum Likelihood {
    kVeryLowLikelihood,
    kLowLikelihood,
    kModerateLikelihood,
    kHighLikelihood
  };

  // `sample_rate_hz` is the rate of the audio later passed to
  // ProcessCaptureAudio(); the detector works on frames of
  // `sample_rate_hz / 100` samples. `likelihood` selects the detection mode.
  VoiceDetection(int sample_rate_hz, Likelihood likelihood);
  // Declared here and defined out of line because `Vad` is incomplete in this
  // header.
  ~VoiceDetection();

  // Not copyable: the instance exclusively owns its detector state.
  VoiceDetection(const VoiceDetection&) = delete;
  VoiceDetection& operator=(const VoiceDetection&) = delete;

  // Returns true if voice is detected in the current frame.
  bool ProcessCaptureAudio(AudioBuffer* audio);

  // Returns the likelihood setting given at construction.
  Likelihood likelihood() const { return likelihood_; }

 private:
  class Vad;

  int sample_rate_hz_;
  size_t frame_size_samples_;
  Likelihood likelihood_;
  std::unique_ptr<Vad> vad_;
};
|
||||||
|
} // namespace webrtc
|
||||||
|
|
||||||
|
#endif // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
|
104
modules/audio_processing/voice_detection_unittest.cc
Normal file
104
modules/audio_processing/voice_detection_unittest.cc
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "api/array_view.h"
|
||||||
|
#include "modules/audio_processing/audio_buffer.h"
|
||||||
|
#include "modules/audio_processing/test/audio_buffer_tools.h"
|
||||||
|
#include "modules/audio_processing/test/bitexactness_tools.h"
|
||||||
|
#include "modules/audio_processing/voice_detection.h"
|
||||||
|
#include "test/gtest.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Number of frames fed through the detector in each bit-exactness test.
constexpr int kNumFramesToProcess = 1000;
|
||||||
|
|
||||||
|
// Process one frame of data and produce the output.
|
||||||
|
bool ProcessOneFrame(int sample_rate_hz,
|
||||||
|
AudioBuffer* audio_buffer,
|
||||||
|
VoiceDetection* voice_detection) {
|
||||||
|
if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {
|
||||||
|
audio_buffer->SplitIntoFrequencyBands();
|
||||||
|
}
|
||||||
|
|
||||||
|
return voice_detection->ProcessCaptureAudio(audio_buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Processes a specified amount of frames, verifies the results and reports
|
||||||
|
// any errors.
|
||||||
|
void RunBitexactnessTest(int sample_rate_hz,
|
||||||
|
size_t num_channels,
|
||||||
|
bool stream_has_voice_reference) {
|
||||||
|
int sample_rate_to_use = std::min(sample_rate_hz, 16000);
|
||||||
|
VoiceDetection voice_detection(sample_rate_to_use,
|
||||||
|
VoiceDetection::kLowLikelihood);
|
||||||
|
|
||||||
|
int samples_per_channel = rtc::CheckedDivExact(sample_rate_hz, 100);
|
||||||
|
const StreamConfig capture_config(sample_rate_hz, num_channels);
|
||||||
|
AudioBuffer capture_buffer(
|
||||||
|
capture_config.sample_rate_hz(), capture_config.num_channels(),
|
||||||
|
capture_config.sample_rate_hz(), capture_config.num_channels(),
|
||||||
|
capture_config.sample_rate_hz(), capture_config.num_channels());
|
||||||
|
test::InputAudioFile capture_file(
|
||||||
|
test::GetApmCaptureTestVectorFileName(sample_rate_hz));
|
||||||
|
std::vector<float> capture_input(samples_per_channel * num_channels);
|
||||||
|
bool stream_has_voice = false;
|
||||||
|
for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) {
|
||||||
|
ReadFloatSamplesFromStereoFile(samples_per_channel, num_channels,
|
||||||
|
&capture_file, capture_input);
|
||||||
|
|
||||||
|
test::CopyVectorToAudioBuffer(capture_config, capture_input,
|
||||||
|
&capture_buffer);
|
||||||
|
|
||||||
|
stream_has_voice =
|
||||||
|
ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection);
|
||||||
|
}
|
||||||
|
|
||||||
|
EXPECT_EQ(stream_has_voice_reference, stream_has_voice);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Expected detector decision for the last processed frame of every test
// vector.
constexpr bool kStreamHasVoiceReference = true;
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Mono capture at each tested sample rate.
TEST(VoiceDetectionBitExactnessTest, Mono8kHz) {
  RunBitexactnessTest(/*sample_rate_hz=*/8000, /*num_channels=*/1,
                      kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Mono16kHz) {
  RunBitexactnessTest(/*sample_rate_hz=*/16000, /*num_channels=*/1,
                      kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Mono32kHz) {
  RunBitexactnessTest(/*sample_rate_hz=*/32000, /*num_channels=*/1,
                      kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Mono48kHz) {
  RunBitexactnessTest(/*sample_rate_hz=*/48000, /*num_channels=*/1,
                      kStreamHasVoiceReference);
}

// Stereo capture at each tested sample rate.
TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) {
  RunBitexactnessTest(/*sample_rate_hz=*/8000, /*num_channels=*/2,
                      kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) {
  RunBitexactnessTest(/*sample_rate_hz=*/16000, /*num_channels=*/2,
                      kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) {
  RunBitexactnessTest(/*sample_rate_hz=*/32000, /*num_channels=*/2,
                      kStreamHasVoiceReference);
}

TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) {
  RunBitexactnessTest(/*sample_rate_hz=*/48000, /*num_channels=*/2,
                      kStreamHasVoiceReference);
}
|
||||||
|
|
||||||
|
} // namespace webrtc
|
@ -54,7 +54,7 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(test::FuzzDataHelper* fuzz_data,
|
|||||||
bool use_agc = fuzz_data->ReadOrDefaultValue(true);
|
bool use_agc = fuzz_data->ReadOrDefaultValue(true);
|
||||||
bool use_ns = fuzz_data->ReadOrDefaultValue(true);
|
bool use_ns = fuzz_data->ReadOrDefaultValue(true);
|
||||||
static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
|
static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
|
||||||
static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
|
bool use_vad = fuzz_data->ReadOrDefaultValue(true);
|
||||||
bool use_agc_limiter = fuzz_data->ReadOrDefaultValue(true);
|
bool use_agc_limiter = fuzz_data->ReadOrDefaultValue(true);
|
||||||
bool use_agc2 = fuzz_data->ReadOrDefaultValue(true);
|
bool use_agc2 = fuzz_data->ReadOrDefaultValue(true);
|
||||||
|
|
||||||
@ -114,6 +114,7 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(test::FuzzDataHelper* fuzz_data,
|
|||||||
use_agc2_adaptive_digital;
|
use_agc2_adaptive_digital;
|
||||||
apm_config.noise_suppression.enabled = use_ns;
|
apm_config.noise_suppression.enabled = use_ns;
|
||||||
apm_config.transient_suppression.enabled = use_ts;
|
apm_config.transient_suppression.enabled = use_ts;
|
||||||
|
apm_config.voice_detection.enabled = use_vad;
|
||||||
|
|
||||||
rtc::scoped_refptr<AudioProcessing> apm =
|
rtc::scoped_refptr<AudioProcessing> apm =
|
||||||
AudioProcessingBuilderForTesting()
|
AudioProcessingBuilderForTesting()
|
||||||
|
Reference in New Issue
Block a user