// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/speech/speech_recognizer_impl.h"
#include <stddef.h>
#include <stdint.h>
#include <memory>
#include <vector>
#include "base/compiler_specific.h"
#include "base/containers/span.h"
#include "base/functional/bind.h"
#include "base/functional/callback_helpers.h"
#include "base/memory/ref_counted.h"
#include "base/memory/scoped_refptr.h"
#include "base/numerics/byte_conversions.h"
#include "base/run_loop.h"
#include "base/strings/string_view_util.h"
#include "base/synchronization/waitable_event.h"
#include "base/test/scoped_feature_list.h"
#include "base/threading/thread.h"
#include "content/browser/speech/network_speech_recognition_engine_impl.h"
#include "content/public/browser/google_streaming_api.pb.h"
#include "content/public/browser/speech_recognition_audio_forwarder_config.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "content/public/common/content_features.h"
#include "content/public/test/browser_task_environment.h"
#include "media/audio/audio_device_description.h"
#include "media/audio/audio_system_impl.h"
#include "media/audio/fake_audio_input_stream.h"
#include "media/audio/fake_audio_output_stream.h"
#include "media/audio/mock_audio_manager.h"
#include "media/audio/test_audio_thread.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_glitch_info.h"
#include "media/base/audio_sample_types.h"
#include "media/base/test_helpers.h"
#include "mojo/public/cpp/bindings/remote.h"
#include "mojo/public/cpp/system/data_pipe.h"
#include "mojo/public/cpp/system/data_pipe_utils.h"
#include "net/base/net_errors.h"
#include "net/http/http_response_headers.h"
#include "net/http/http_util.h"
#include "services/network/public/cpp/url_loader_completion_status.h"
#include "services/network/public/cpp/weak_wrapper_shared_url_loader_factory.h"
#include "services/network/public/mojom/url_response_head.mojom.h"
#include "services/network/test/test_url_loader_factory.h"
#include "testing/gtest/include/gtest/gtest.h"
using media::AudioInputStream;
using media::AudioOutputStream;
using media::AudioParameters;
namespace content {
namespace {
class MockCapturerSource : public media::AudioCapturerSource {
public:
MockCapturerSource() = default;
MOCK_METHOD2(Initialize,
void(const media::AudioParameters& params,
CaptureCallback* callback));
MOCK_METHOD0(Start, void());
MOCK_METHOD0(Stop, void());
MOCK_METHOD1(SetAutomaticGainControl, void(bool enable));
MOCK_METHOD1(SetVolume, void(double volume));
MOCK_METHOD1(SetOutputDeviceForAec,
void(const std::string& output_device_id));
protected:
~MockCapturerSource() override = default;
};
} // namespace
class SpeechRecognizerImplTest : public SpeechRecognitionEventListener,
public testing::Test {
public:
SpeechRecognizerImplTest()
: audio_capturer_source_(
base::MakeRefCounted<testing::NiceMock<MockCapturerSource>>()),
recognition_started_(false),
recognition_ended_(false),
result_received_(false),
audio_started_(false),
audio_ended_(false),
sound_started_(false),
sound_ended_(false),
error_(media::mojom::SpeechRecognitionErrorCode::kNone),
volume_(-1.0f) {
// This test environment is not set up to support out-of-process services.
feature_list_.InitWithFeatures(
/*enabled_features=*/{},
/*disabled_features=*/{features::kAudioServiceOutOfProcess});
// SpeechRecognizer takes ownership of sr_engine.
std::unique_ptr<NetworkSpeechRecognitionEngineImpl> sr_engine =
std::make_unique<NetworkSpeechRecognitionEngineImpl>(
base::MakeRefCounted<network::WeakWrapperSharedURLLoaderFactory>(
&url_loader_factory_));
NetworkSpeechRecognitionEngineImpl::Config config;
config.audio_num_bits_per_sample =
SpeechRecognizerImpl::kNumBitsPerAudioSample;
config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate;
config.filter_profanities = false;
sr_engine->SetConfig(config);
const int kTestingSessionId = 1;
audio_manager_ = std::make_unique<media::MockAudioManager>(
std::make_unique<media::TestAudioThread>(true));
audio_manager_->SetInputStreamParameters(
media::AudioParameters::UnavailableDeviceParams());
audio_system_ =
std::make_unique<media::AudioSystemImpl>(audio_manager_.get());
SpeechRecognizerImpl::SetAudioEnvironmentForTesting(
audio_system_.get(), audio_capturer_source_.get());
recognizer_ = new SpeechRecognizerImpl(this, audio_system_.get(),
kTestingSessionId, false, false,
std::move(sr_engine), std::nullopt);
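// Bytes in one simulated capture packet:
// samples/sec * packet duration (ms) * channels * bits/sample, divided by
// 8 bits/byte and 1000 ms/sec.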
int audio_packet_length_bytes =
(SpeechRecognizerImpl::kAudioSampleRate *
NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs *
ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) *
SpeechRecognizerImpl::kNumBitsPerAudioSample) /
(8 * 1000);
audio_packet_.resize(audio_packet_length_bytes);
const int channels =
ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout);
int bytes_per_sample = SpeechRecognizerImpl::kNumBitsPerAudioSample / 8;
const int frames = audio_packet_length_bytes / channels / bytes_per_sample;
audio_bus_ = media::AudioBus::Create(channels, frames);
audio_bus_->Zero();
}
~SpeechRecognizerImplTest() override {
SpeechRecognizerImpl::SetAudioEnvironmentForTesting(nullptr, nullptr);
audio_manager_->Shutdown();
}
[[nodiscard]] bool GetUpstreamRequest(
const network::TestURLLoaderFactory::PendingRequest**
pending_request_out) {
return GetPendingRequest(pending_request_out, "/up");
}
[[nodiscard]] bool GetDownstreamRequest(
const network::TestURLLoaderFactory::PendingRequest**
pending_request_out) {
return GetPendingRequest(pending_request_out, "/down");
}
[[nodiscard]] bool GetPendingRequest(
const network::TestURLLoaderFactory::PendingRequest** pending_request_out,
const char* url_substring) {
for (const auto& pending_request :
*url_loader_factory_.pending_requests()) {
if (pending_request.request.url.spec().find(url_substring) !=
std::string::npos) {
*pending_request_out = &pending_request;
return true;
}
}
return false;
}
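// Verifies the causal ordering of events: an *ended* event implies the
// corresponding *started* event, sound implies audio has started, and audio
// implies recognition has started.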
void CheckEventsConsistency() {
// Note: "!x || y" == "x implies y".
EXPECT_TRUE(!recognition_ended_ || recognition_started_);
EXPECT_TRUE(!audio_ended_ || audio_started_);
EXPECT_TRUE(!sound_ended_ || sound_started_);
EXPECT_TRUE(!audio_started_ || recognition_started_);
EXPECT_TRUE(!sound_started_ || audio_started_);
EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_));
EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_));
}
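// At the end of a session, every *started* event must be paired with the
// corresponding *ended* event.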
void CheckFinalEventsConsistency() {
// Note: "!(x ^ y)" == "(x && y) || (!x && !x)".
EXPECT_FALSE(recognition_started_ ^ recognition_ended_);
EXPECT_FALSE(audio_started_ ^ audio_ended_);
EXPECT_FALSE(sound_started_ ^ sound_ended_);
}
// Overridden from SpeechRecognitionEventListener:
void OnAudioStart(int session_id) override {
audio_started_ = true;
CheckEventsConsistency();
}
void OnAudioEnd(int session_id) override {
audio_ended_ = true;
CheckEventsConsistency();
}
void OnRecognitionResults(
int session_id,
const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results)
override {
result_received_ = true;
}
void OnRecognitionError(
int session_id,
const media::mojom::SpeechRecognitionError& error) override {
EXPECT_TRUE(recognition_started_);
EXPECT_FALSE(recognition_ended_);
error_ = error.code;
}
void OnAudioLevelsChange(int session_id,
float volume,
float noise_volume) override {
volume_ = volume;
noise_volume_ = noise_volume;
}
void OnRecognitionEnd(int session_id) override {
recognition_ended_ = true;
CheckEventsConsistency();
}
void OnRecognitionStart(int session_id) override {
recognition_started_ = true;
CheckEventsConsistency();
}
void OnSoundStart(int session_id) override {
sound_started_ = true;
CheckEventsConsistency();
}
void OnSoundEnd(int session_id) override {
sound_ended_ = true;
CheckEventsConsistency();
}
void CopyPacketToAudioBus() {
static_assert(SpeechRecognizerImpl::kNumBitsPerAudioSample == 16,
"FromInterleaved expects 2 bytes.");
// Copy the created signal into an audio bus in a deinterleaved format.
audio_bus_->FromInterleaved<media::SignedInt16SampleTypeTraits>(
UNSAFE_TODO(reinterpret_cast<int16_t*>(audio_packet_.data())),
audio_bus_->frames());
}
void FillPacketWithTestWaveform() {
// Fill the input with a simple pattern, a 125 Hz sawtooth waveform: the byte
// counter wraps every 256 bytes (128 16-bit samples), which at the 16 kHz
// capture rate corresponds to 125 Hz.
for (size_t i = 0; i < audio_packet_.size(); ++i)
audio_packet_[i] = static_cast<uint8_t>(i);
CopyPacketToAudioBus();
}
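// Fills the packet with a deterministic, low-amplitude pseudo-noise pattern,
// giving the endpointer a non-zero noise floor to estimate.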
void FillPacketWithNoise() {
int value = 0;
int factor = 175;
for (size_t i = 0; i < audio_packet_.size(); ++i) {
value += factor;
audio_packet_[i] = value % 100;
}
CopyPacketToAudioBus();
}
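// Delivers |data| to the recognizer through its AudioCapturerSource
// CaptureCallback interface, simulating one packet arriving from the capture
// device.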
void Capture(media::AudioBus* data) {
auto* capture_callback =
static_cast<media::AudioCapturerSource::CaptureCallback*>(
recognizer_.get());
capture_callback->Capture(data, base::TimeTicks::Now(), {}, 0.0);
}
void OnCaptureError() {
auto* capture_callback =
static_cast<media::AudioCapturerSource::CaptureCallback*>(
recognizer_.get());
capture_callback->OnCaptureError(
media::AudioCapturerSource::ErrorCode::kUnknown, "");
}
void WaitForAudioThreadToPostDeviceInfo() {
media::WaitableMessageLoopEvent event;
audio_manager_->GetTaskRunner()->PostTaskAndReply(
FROM_HERE, base::DoNothing(), event.GetClosure());
// Runs the loop and waits for the audio thread to call event's closure,
// which means AudioSystem reply containing device parameters is already
// queued on the main thread.
event.RunAndWait();
}
protected:
base::test::ScopedFeatureList feature_list_;
BrowserTaskEnvironment task_environment_;
network::TestURLLoaderFactory url_loader_factory_;
scoped_refptr<SpeechRecognizerImpl> recognizer_;
std::unique_ptr<media::MockAudioManager> audio_manager_;
std::unique_ptr<media::AudioSystem> audio_system_;
scoped_refptr<MockCapturerSource> audio_capturer_source_;
bool recognition_started_;
bool recognition_ended_;
bool result_received_;
bool audio_started_;
bool audio_ended_;
bool sound_started_;
bool sound_ended_;
media::mojom::SpeechRecognitionErrorCode error_;
std::vector<uint8_t> audio_packet_;
std::unique_ptr<media::AudioBus> audio_bus_;
float volume_;
float noise_volume_;
};
TEST_F(SpeechRecognizerImplTest, StartNoInputDevices) {
// Check that a capture error is reported when no input devices are available.
audio_manager_->SetHasInputDevices(false);
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
EXPECT_TRUE(recognition_started_);
EXPECT_FALSE(audio_started_);
EXPECT_FALSE(result_received_);
OnCaptureError();
base::RunLoop().RunUntilIdle();
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kAudioCapture, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, StartFakeInputDevice) {
// Check that recognition starts and audio is captured even though the audio
// manager reports no input devices, and that stopping completes without error.
audio_manager_->SetHasInputDevices(false);
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
Capture(audio_bus_.get());
recognizer_->StopAudioCapture();
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(recognition_started_);
EXPECT_TRUE(audio_started_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
recognizer_->AbortRecognition();
base::RunLoop().RunUntilIdle();
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, StopBeforeDeviceInfoReceived) {
// Check for callbacks when stopping capture before the device info reply is
// received from the AudioSystem.
base::WaitableEvent event(base::WaitableEvent::ResetPolicy::AUTOMATIC,
base::WaitableEvent::InitialState::NOT_SIGNALED);
// Block audio thread.
audio_manager_->GetTaskRunner()->PostTask(
FROM_HERE,
base::BindOnce(&base::WaitableEvent::Wait, base::Unretained(&event)));
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
recognizer_->StopAudioCapture();
base::RunLoop().RunUntilIdle();
// Release audio thread and receive a callback from it.
event.Signal();
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(recognition_started_);
EXPECT_FALSE(audio_started_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, CancelBeforeDeviceInfoReceived) {
// Check for callbacks when canceling recognition before the device info reply
// is received from the AudioSystem.
base::WaitableEvent event(base::WaitableEvent::ResetPolicy::AUTOMATIC,
base::WaitableEvent::InitialState::NOT_SIGNALED);
// Block audio thread.
audio_manager_->GetTaskRunner()->PostTask(
FROM_HERE,
base::BindOnce(&base::WaitableEvent::Wait, base::Unretained(&event)));
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
recognizer_->AbortRecognition();
base::RunLoop().RunUntilIdle();
// Release audio thread and receive a callback from it.
event.Signal();
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(recognition_started_);
EXPECT_FALSE(audio_started_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, StopNoData) {
// Check for callbacks when stopping capture before any audio gets recorded.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
recognizer_->StopAudioCapture();
base::RunLoop().RunUntilIdle(); // EVENT_START and EVENT_STOP processing.
EXPECT_TRUE(recognition_started_);
EXPECT_FALSE(audio_started_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, CancelNoData) {
// Check for callbacks when canceling recognition before any audio gets
// recorded.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
recognizer_->AbortRecognition();
base::RunLoop().RunUntilIdle(); // EVENT_START and EVENT_ABORT processing.
EXPECT_TRUE(recognition_started_);
EXPECT_FALSE(audio_started_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kAborted, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, StopWithData) {
// Start recording, give some data and then stop. This should wait for the
// network callback to arrive before completion.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
// Send 5 chunks of mock audio data and verify that each of them immediately
// results in a packet being sent out over the network. This verifies that we
// stream out encoded data as chunks without waiting for the full recording to
// complete.
const size_t kNumChunks = 5;
mojo::Remote<network::mojom::ChunkedDataPipeGetter> chunked_data_pipe_getter;
mojo::ScopedDataPipeProducerHandle producer_handle;
mojo::ScopedDataPipeConsumerHandle consumer_handle;
ASSERT_EQ(mojo::CreateDataPipe(nullptr, producer_handle, consumer_handle),
MOJO_RESULT_OK);
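// The upstream request body is streamed through a chunked Mojo data pipe; the
// consumer end of this pipe is read below to observe each encoded chunk as it
// is produced.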
for (size_t i = 0; i < kNumChunks; ++i) {
Capture(audio_bus_.get());
if (i == 0) {
// Set up data channel to read chunked upload data. Must be done after the
// first OnData() call.
base::RunLoop().RunUntilIdle();
const network::TestURLLoaderFactory::PendingRequest* upstream_request;
ASSERT_TRUE(GetUpstreamRequest(&upstream_request));
ASSERT_TRUE(upstream_request->request.request_body);
ASSERT_EQ(1u, upstream_request->request.request_body->elements()->size());
auto& element =
(*upstream_request->request.request_body->elements_mutable())[0];
ASSERT_EQ(network::DataElement::Tag::kChunkedDataPipe, element.type());
chunked_data_pipe_getter.Bind(
element.As<network::DataElementChunkedDataPipe>()
.ReleaseChunkedDataPipeGetter());
chunked_data_pipe_getter->StartReading(std::move(producer_handle));
}
std::string data;
while (true) {
base::RunLoop().RunUntilIdle();
base::span<const uint8_t> buffer;
MojoResult result =
consumer_handle->BeginReadData(MOJO_READ_DATA_FLAG_NONE, buffer);
if (result == MOJO_RESULT_OK) {
data.append(base::as_string_view(buffer));
consumer_handle->EndReadData(buffer.size());
continue;
}
if (result == MOJO_RESULT_SHOULD_WAIT) {
// Some data has already been read, assume there's no more to read.
if (!data.empty())
break;
continue;
}
FAIL() << "Mojo pipe closed unexpectedly";
}
EXPECT_FALSE(data.empty());
}
recognizer_->StopAudioCapture();
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(audio_started_);
EXPECT_TRUE(audio_ended_);
EXPECT_FALSE(recognition_ended_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
// Create a response string.
proto::SpeechRecognitionEvent proto_event;
proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
proto_result->set_final(true);
proto::SpeechRecognitionAlternative* proto_alternative =
proto_result->add_alternative();
proto_alternative->set_confidence(0.5f);
proto_alternative->set_transcript("123");
std::string msg_string;
proto_event.SerializeToString(&msg_string);
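// The downstream response is a stream of length-prefixed protobuf messages:
// each serialized SpeechRecognitionEvent is preceded by its size as a 32-bit
// big-endian integer.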
msg_string.insert(0u, base::as_string_view(base::U32ToBigEndian(
base::checked_cast<uint32_t>(msg_string.size()))));
// Issue the network callback to complete the process.
const network::TestURLLoaderFactory::PendingRequest* downstream_request;
ASSERT_TRUE(GetDownstreamRequest(&downstream_request));
url_loader_factory_.AddResponse(downstream_request->request.url.spec(),
msg_string);
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(recognition_ended_);
EXPECT_TRUE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, CancelWithData) {
// Start recording, give some data and then cancel.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
Capture(audio_bus_.get());
base::RunLoop().RunUntilIdle();
recognizer_->AbortRecognition();
base::RunLoop().RunUntilIdle();
// There should be both upstream and downstream pending requests.
ASSERT_EQ(2u, url_loader_factory_.pending_requests()->size());
EXPECT_TRUE(recognition_started_);
EXPECT_TRUE(audio_started_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kAborted, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, ConnectionError) {
// Start recording, give some data and then stop. Issue the network callback
// with a connection error and verify that the recognizer bubbles the error up.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
Capture(audio_bus_.get());
base::RunLoop().RunUntilIdle();
// There should be both upstream and downstream pending requests.
ASSERT_EQ(2u, url_loader_factory_.pending_requests()->size());
recognizer_->StopAudioCapture();
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(audio_started_);
EXPECT_TRUE(audio_ended_);
EXPECT_FALSE(recognition_ended_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
// Issue the network callback to complete the process.
const network::TestURLLoaderFactory::PendingRequest* pending_request;
ASSERT_TRUE(GetUpstreamRequest(&pending_request));
url_loader_factory_.AddResponse(
pending_request->request.url, network::mojom::URLResponseHead::New(), "",
network::URLLoaderCompletionStatus(net::ERR_CONNECTION_REFUSED));
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(recognition_ended_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNetwork, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, ServerError) {
// Start recording, give some data and then stop. Issue the network callback
// with a 500 error and verify that the recognizer bubbles the error up.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
Capture(audio_bus_.get());
base::RunLoop().RunUntilIdle();
// There should be both upstream and downstream pending requests.
ASSERT_EQ(2u, url_loader_factory_.pending_requests()->size());
recognizer_->StopAudioCapture();
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(audio_started_);
EXPECT_TRUE(audio_ended_);
EXPECT_FALSE(recognition_ended_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
const network::TestURLLoaderFactory::PendingRequest* pending_request;
ASSERT_TRUE(GetUpstreamRequest(&pending_request));
auto response = network::mojom::URLResponseHead::New();
const char kHeaders[] = "HTTP/1.0 500 Internal Server Error";
response->headers = base::MakeRefCounted<net::HttpResponseHeaders>(
net::HttpUtil::AssembleRawHeaders(kHeaders));
url_loader_factory_.AddResponse(pending_request->request.url,
std::move(response), "",
network::URLLoaderCompletionStatus());
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(recognition_ended_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNetwork, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, OnCaptureError_PropagatesError) {
// Check that things tear down properly when the audio capturer source reports
// an error.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
OnCaptureError();
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(recognition_started_);
EXPECT_FALSE(audio_started_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kAudioCapture, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) {
// Start recording and give a lot of packets with audio samples set to zero.
// This should trigger the no-speech detector and issue a callback.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
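// Feed one packet more than the no-speech timeout's worth of silent audio so
// the no-speech detector is guaranteed to fire.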
int num_packets =
(SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs +
1;
// The vector is already filled with zero value samples on create.
for (int i = 0; i < num_packets; ++i) {
Capture(audio_bus_.get());
}
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(recognition_started_);
EXPECT_TRUE(audio_started_);
EXPECT_FALSE(result_received_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNoSpeech, error_);
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
// Start recording and give a lot of packets with audio samples set to zero
// and then some more with reasonably loud audio samples. This should be
// treated as normal speech input and the no-speech detector should not get
// triggered.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs;
// The vector is already filled with zero value samples on create.
for (int i = 0; i < num_packets / 2; ++i) {
Capture(audio_bus_.get());
}
FillPacketWithTestWaveform();
for (int i = 0; i < num_packets / 2; ++i) {
Capture(audio_bus_.get());
}
base::RunLoop().RunUntilIdle();
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
EXPECT_TRUE(audio_started_);
EXPECT_FALSE(audio_ended_);
EXPECT_FALSE(recognition_ended_);
recognizer_->AbortRecognition();
base::RunLoop().RunUntilIdle();
CheckFinalEventsConsistency();
}
TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
// Start recording and give a lot of packets with audio samples set to zero
// and then some more with reasonably loud audio samples. Check that we don't
// get the callback during the estimation phase, then get zero for the silence
// samples and proper volume for the loud audio.
recognizer_->StartRecognition(
media::AudioDeviceDescription::kDefaultDeviceId);
base::RunLoop().RunUntilIdle(); // EVENT_PREPARE processing.
WaitForAudioThreadToPostDeviceInfo();
base::RunLoop().RunUntilIdle(); // EVENT_START processing.
// Feed some samples to begin with for the endpointer to do noise estimation.
int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs /
NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs;
FillPacketWithNoise();
for (int i = 0; i < num_packets; ++i) {
Capture(audio_bus_.get());
}
base::RunLoop().RunUntilIdle();
EXPECT_EQ(-1.0f, volume_); // No audio volume set yet.
// The vector is already filled with zero value samples on create.
Capture(audio_bus_.get());
base::RunLoop().RunUntilIdle();
EXPECT_FLOAT_EQ(0.74939233f, volume_);
FillPacketWithTestWaveform();
Capture(audio_bus_.get());
base::RunLoop().RunUntilIdle();
EXPECT_NEAR(0.89926866f, volume_, 0.00001f);
EXPECT_FLOAT_EQ(0.75071919f, noise_volume_);
EXPECT_EQ(media::mojom::SpeechRecognitionErrorCode::kNone, error_);
EXPECT_FALSE(audio_ended_);
EXPECT_FALSE(recognition_ended_);
recognizer_->AbortRecognition();
base::RunLoop().RunUntilIdle();
CheckFinalEventsConsistency();
}
} // namespace content