blob: 23ddd9c64ba261d15976ee591afd4d7d61cf059c [file] [log] [blame]
// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/speech/tts_controller_impl.h"
#include <stddef.h>
#include <algorithm>
#include <string>
#include <vector>
#include "base/containers/queue.h"
#include "base/functional/bind.h"
#include "base/json/json_reader.h"
#include "base/metrics/histogram_macros.h"
#include "base/metrics/user_metrics.h"
#include "base/observer_list.h"
#include "base/strings/string_util.h"
#include "base/task/single_thread_task_runner.h"
#include "base/values.h"
#include "build/build_config.h"
#include "content/browser/speech/tts_utterance_impl.h"
#include "content/public/browser/content_browser_client.h"
#include "content/public/browser/tts_utterance.h"
#include "content/public/browser/visibility.h"
#include "content/public/browser/web_contents.h"
#include "content/public/common/content_client.h"
#include "services/data_decoder/public/cpp/safe_xml_parser.h"
#include "services/data_decoder/public/mojom/xml_parser.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_synthesis.mojom.h"
#include "ui/base/l10n/l10n_util.h"
#if BUILDFLAG(IS_CHROMEOS)
#include "content/public/browser/tts_controller_delegate.h"
#endif
namespace content {
namespace {
// Sentinel passed as the char index of a TTS event that has none.
constexpr int kInvalidCharIndex = -1;
// Sentinel passed as the length of a TTS event that has none.
constexpr int kInvalidLength = -1;
#if BUILDFLAG(IS_CHROMEOS)
// Returns true when |voice| is the voice identified by the preferred voice
// |id|. A native voice only matches an id whose engine id is empty; any other
// voice must match on both the name and the engine id.
bool VoiceIdMatches(
    const std::optional<TtsControllerDelegate::PreferredVoiceId>& id,
    const content::VoiceData& voice) {
  // Nothing to compare against.
  if (!id.has_value()) {
    return false;
  }
  // A voice without a name, or a non-native voice without an engine id,
  // never matches.
  if (voice.name.empty()) {
    return false;
  }
  if (voice.engine_id.empty() && !voice.native) {
    return false;
  }
  const bool same_name = id->name == voice.name;
  if (voice.native) {
    return same_name && id->id.empty();
  }
  return same_name && id->id == voice.engine_id;
}
#endif // BUILDFLAG(IS_CHROMEOS)
// Downcasts |utterance| to TtsUtteranceImpl using an unchecked static_cast.
TtsUtteranceImpl* AsUtteranceImpl(TtsUtterance* utterance) {
  auto* const impl = static_cast<TtsUtteranceImpl*>(utterance);
  return impl;
}
// Returns the utterance's spoken_by_remote_engine() flag. Returns false for a
// null |utterance| or one that has no engine id.
bool IsUtteranceSpokenByRemoteEngine(TtsUtterance* utterance) {
  if (!utterance || utterance->GetEngineId().empty()) {
    return false;
  }
  return AsUtteranceImpl(utterance)->spoken_by_remote_engine();
}
} // namespace
//
// VoiceData
//
// A default-constructed voice is neither remote nor native.
VoiceData::VoiceData() : remote(false), native(false) {}

VoiceData::VoiceData(const VoiceData& other) = default;

VoiceData::~VoiceData() = default;
//
// TtsController
//
// Returns the TtsControllerImpl singleton through the public interface type.
TtsController* TtsController::GetInstance() {
  TtsControllerImpl* const impl = TtsControllerImpl::GetInstance();
  return impl;
}
// Forwards |enabled| to the implementation's test-only switch.
void TtsController::SkipAddNetworkChangeObserverForTests(bool enabled) {
  TtsControllerImpl::SkipAddNetworkChangeObserverForTests(enabled);
}
// IMPORTANT!
// These values are written to logs. Do not renumber or delete
// existing items; add new entries to the end of the list.
// LINT.IfChange(UMATextToSpeechEvent)
// Buckets recorded to the "TextToSpeech.Event" UMA histogram. Each value
// corresponds to one TtsEventType handled in TtsControllerImpl::OnTtsEvent().
enum class UMATextToSpeechEvent {
START = 0,
END = 1,
WORD = 2,
SENTENCE = 3,
MARKER = 4,
INTERRUPTED = 5,
CANCELLED = 6,
SPEECH_ERROR = 7,
PAUSE = 8,
RESUME = 9,
// This must always be the last enum. It's okay for its value to
// increase, but none of the other enum values may change.
COUNT
};
// LINT.ThenChange(/tools/metrics/histograms/metadata/accessibility/enums.xml:TextToSpeechEvent)
//
// TtsControllerImpl
//
// static
// When true, the constructor does not register the controller as a
// net::NetworkChangeNotifier observer.
bool TtsControllerImpl::skip_add_network_change_observer_for_tests_ = false;
// static
// Returns the process-wide instance managed by base::Singleton.
TtsControllerImpl* TtsControllerImpl::GetInstance() {
  using Holder = base::Singleton<TtsControllerImpl>;
  return Holder::get();
}
// static
void TtsControllerImpl::SkipAddNetworkChangeObserverForTests(bool enabled) {
TtsControllerImpl::skip_add_network_change_observer_for_tests_ = enabled;
}
void TtsControllerImpl::SetStopSpeakingWhenHidden(bool value) {
stop_speaking_when_hidden_ = value;
}
// Registers for network change notifications (unless disabled for tests) and
// seeds the remote-voice policy from the current connection type.
TtsControllerImpl::TtsControllerImpl() {
  const bool should_observe = !skip_add_network_change_observer_for_tests_;
  if (should_observe) {
    net::NetworkChangeNotifier::AddNetworkChangeObserver(this);
  }
  const auto initial_type = net::NetworkChangeNotifier::GetConnectionType();
  OnNetworkChanged(initial_type);
}
// Finishes the utterance being spoken, drains the queue without sending
// events, and unregisters from network change notifications.
TtsControllerImpl::~TtsControllerImpl() {
  if (current_utterance_ != nullptr) {
    current_utterance_->Finish();
    SetCurrentUtterance(nullptr);
  }

  // Clear any queued utterances too. Don't send events.
  constexpr bool kSendEvents = false;
  ClearUtteranceQueue(kSendEvents);

  // NOTE(review): the observer is removed unconditionally, even when the
  // constructor skipped registration for tests — confirm that is harmless.
  net::NetworkChangeNotifier::RemoveNetworkChangeObserver(this);
}
// Entry point for new utterances: either drops, queues, or immediately speaks
// |utterance| depending on visibility rules, platform/engine readiness, the
// paused state and whether something is already being spoken.
void TtsControllerImpl::SpeakOrEnqueue(
    std::unique_ptr<TtsUtterance> utterance) {
  if (!ShouldSpeakUtterance(utterance.get())) {
    utterance->Finish();
    return;
  }

  // While the TTS platform or the built-in engine is still loading, every
  // utterance is postponed so that none is dropped by a race with
  // initialization. The engine is only queried when the platform itself is
  // not loading.
  bool must_wait_for_init = TtsPlatformLoading();
  if (!must_wait_for_init && engine_delegate_) {
    must_wait_for_init = !engine_delegate_->IsBuiltInTtsEngineInitialized(
        utterance->GetBrowserContext());
  }
  if (must_wait_for_init) {
    GetTtsPlatform()->LoadBuiltInTtsEngine(utterance->GetBrowserContext());
    if (utterance->GetShouldClearQueue()) {
      ClearUtteranceQueue(true);
    }
    utterance_list_.emplace_back(std::move(utterance));
    return;
  }

  if (paused_) {
    if (utterance->GetShouldClearQueue()) {
      // An utterance that can't be queued flushes everything, but the
      // controller stays paused (Stop() clears the paused flag).
      Stop();
      utterance_list_.emplace_back(std::move(utterance));
      paused_ = true;
    } else {
      utterance_list_.emplace_back(std::move(utterance));
    }
    return;
  }

  // Not paused: queue behind ongoing speech unless this utterance interrupts.
  if (IsSpeaking() && !utterance->GetShouldClearQueue()) {
    utterance_list_.emplace_back(std::move(utterance));
    return;
  }

  Stop();
  SpeakNow(std::move(utterance));
}
void TtsControllerImpl::Stop() {
Stop(GURL());
}
void TtsControllerImpl::Stop(const GURL& source_url) {
StopAndClearQueue(source_url);
}
// Stops the current utterance if it matches |source_url|; only when that
// succeeds is the pending queue cancelled (with events).
void TtsControllerImpl::StopAndClearQueue(const GURL& source_url) {
  const bool stopped = StopCurrentUtteranceIfMatches(source_url);
  if (!stopped) {
    return;
  }
  ClearUtteranceQueue(true);
}
bool TtsControllerImpl::StopCurrentUtteranceIfMatches(const GURL& source_url) {
base::RecordAction(base::UserMetricsAction("TextToSpeech.Stop"));
paused_ = false;
if (!source_url.is_empty() && current_utterance_ &&
current_utterance_->GetSrcUrl().DeprecatedGetOriginAsURL() !=
source_url.DeprecatedGetOriginAsURL())
return false;
StopCurrentUtterance();
return true;
}
// Stops whatever is speaking, via the engine delegate for locally handled
// engine utterances and via the platform otherwise, then reports the current
// utterance as interrupted and releases it.
void TtsControllerImpl::StopCurrentUtterance() {
  const bool remote_engine =
      IsUtteranceSpokenByRemoteEngine(current_utterance_.get());
  const bool stop_via_engine =
      engine_delegate_ && current_utterance_ &&
      !current_utterance_->GetEngineId().empty() && !remote_engine;

  if (stop_via_engine) {
    engine_delegate_->Stop(current_utterance_.get());
  } else if (TtsPlatformReady()) {
    GetTtsPlatform()->ClearError();
    GetTtsPlatform()->StopSpeaking();
  }

  if (current_utterance_) {
    current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
                                   kInvalidLength, std::string());
  }
  FinishCurrentUtterance();
}
// Pauses speech. Does nothing beyond recording the action when already
// paused. The paused flag is set even if nothing is currently being spoken.
void TtsControllerImpl::Pause() {
  base::RecordAction(base::UserMetricsAction("TextToSpeech.Pause"));
  if (paused_) {
    return;
  }
  paused_ = true;

  const bool remote_engine =
      IsUtteranceSpokenByRemoteEngine(current_utterance_.get());
  const bool pause_via_engine =
      engine_delegate_ && current_utterance_ &&
      !current_utterance_->GetEngineId().empty() && !remote_engine;
  if (pause_via_engine) {
    engine_delegate_->Pause(current_utterance_.get());
    return;
  }

  if (!current_utterance_) {
    return;
  }
  DCHECK(TtsPlatformReady());
  GetTtsPlatform()->ClearError();
  GetTtsPlatform()->Pause();
}
// Resumes speech after Pause(). With a current utterance, the engine delegate
// or the platform is asked to resume; with none, the next queued utterance is
// started instead.
void TtsControllerImpl::Resume() {
  base::RecordAction(base::UserMetricsAction("TextToSpeech.Resume"));
  if (!paused_) {
    return;
  }
  paused_ = false;

  const bool remote_engine =
      IsUtteranceSpokenByRemoteEngine(current_utterance_.get());
  const bool resume_via_engine =
      engine_delegate_ && current_utterance_ &&
      !current_utterance_->GetEngineId().empty() && !remote_engine;
  if (resume_via_engine) {
    engine_delegate_->Resume(current_utterance_.get());
    return;
  }

  if (!current_utterance_) {
    SpeakNextUtterance();
    return;
  }
  DCHECK(TtsPlatformReady());
  GetTtsPlatform()->ClearError();
  GetTtsPlatform()->Resume();
}
// Forwards a language uninstall request to the engine delegate. Silently
// ignored when no engine delegate is set.
void TtsControllerImpl::UninstallLanguageRequest(
    content::BrowserContext* browser_context,
    const std::string& lang,
    const std::string& client_id,
    int source,
    bool uninstall_immediately) {
  if (engine_delegate_) {
    engine_delegate_->UninstallLanguageRequest(
        browser_context, lang, client_id, source, uninstall_immediately);
  }
}
// Forwards a language install request to the engine delegate. Silently
// ignored when no engine delegate is set.
void TtsControllerImpl::InstallLanguageRequest(BrowserContext* browser_context,
                                               const std::string& lang,
                                               const std::string& client_id,
                                               int source) {
  if (engine_delegate_) {
    engine_delegate_->InstallLanguageRequest(browser_context, lang, client_id,
                                             source);
  }
}
// Forwards a language status query to the engine delegate. Silently ignored
// when no engine delegate is set.
void TtsControllerImpl::LanguageStatusRequest(BrowserContext* browser_context,
                                              const std::string& lang,
                                              const std::string& client_id,
                                              int source) {
  if (engine_delegate_) {
    engine_delegate_->LanguageStatusRequest(browser_context, lang, client_id,
                                            source);
  }
}
void TtsControllerImpl::OnTtsEvent(int utterance_id,
TtsEventType event_type,
int char_index,
int length,
const std::string& error_message) {
// We may sometimes receive completion callbacks "late", after we've
// already finished the utterance (for example because another utterance
// interrupted or we got a call to Stop). This is normal and we can
// safely just ignore these events.
if (!current_utterance_ || utterance_id != current_utterance_->GetId()) {
return;
}
UMATextToSpeechEvent metric;
switch (event_type) {
case TTS_EVENT_START:
metric = UMATextToSpeechEvent::START;
break;
case TTS_EVENT_END:
metric = UMATextToSpeechEvent::END;
break;
case TTS_EVENT_WORD:
metric = UMATextToSpeechEvent::WORD;
break;
case TTS_EVENT_SENTENCE:
metric = UMATextToSpeechEvent::SENTENCE;
break;
case TTS_EVENT_MARKER:
metric = UMATextToSpeechEvent::MARKER;
break;
case TTS_EVENT_INTERRUPTED:
metric = UMATextToSpeechEvent::INTERRUPTED;
break;
case TTS_EVENT_CANCELLED:
metric = UMATextToSpeechEvent::CANCELLED;
break;
case TTS_EVENT_ERROR:
metric = UMATextToSpeechEvent::SPEECH_ERROR;
break;
case TTS_EVENT_PAUSE:
metric = UMATextToSpeechEvent::PAUSE;
break;
case TTS_EVENT_RESUME:
metric = UMATextToSpeechEvent::RESUME;
break;
default:
NOTREACHED();
}
UMA_HISTOGRAM_ENUMERATION("TextToSpeech.Event", metric,
UMATextToSpeechEvent::COUNT);
current_utterance_->OnTtsEvent(event_type, char_index, length, error_message);
if (current_utterance_->IsFinished()) {
FinishCurrentUtterance();
SpeakNextUtterance();
}
}
void TtsControllerImpl::OnTtsUtteranceBecameInvalid(int utterance_id) {
#if BUILDFLAG(IS_CHROMEOS)
// This handles the case that the utterance originated from the standalone
// browser becomes invalid, we need to stop
RemoveUtteranceAndStopIfNeeded(utterance_id);
#else
NOTREACHED();
#endif
}
// Appends the available voices to |out_voices|: platform voices first (when
// the platform is ready), then engine voices (when a browser context is given
// and the built-in engine is initialized). The platform then finalizes the
// ordering, and remote voices are removed when they are not allowed.
void TtsControllerImpl::GetVoices(BrowserContext* browser_context,
                                  const GURL& source_url,
                                  std::vector<VoiceData>* out_voices) {
  // Fetch the platform first so that |engine_delegate_| can be set if
  // necessary.
  TtsPlatform* const platform = GetTtsPlatform();
  DCHECK(platform);

  // Make sure all built-in voices are loaded; a no-op if they already are.
  platform->LoadBuiltInTtsEngine(browser_context);

  if (TtsPlatformReady()) {
    platform->GetVoices(out_voices);
  }

  const bool engine_usable =
      browser_context && engine_delegate_ &&
      engine_delegate_->IsBuiltInTtsEngineInitialized(browser_context);
  if (engine_usable) {
    engine_delegate_->GetVoices(browser_context, source_url, out_voices);
  }

  platform->FinalizeVoiceOrdering(*out_voices);

  if (allow_remote_voices_) {
    return;
  }
  const auto first_removed =
      std::remove_if(out_voices->begin(), out_voices->end(),
                     [](const VoiceData& candidate) { return candidate.remote; });
  out_voices->erase(first_removed, out_voices->end());
}
// True when an utterance is current, or when a ready platform reports that it
// is speaking.
bool TtsControllerImpl::IsSpeaking() {
  if (current_utterance_ != nullptr) {
    return true;
  }
  return TtsPlatformReady() && GetTtsPlatform()->IsSpeaking();
}
// Broadcasts a language install status change to every registered
// UpdateLanguageStatusDelegate.
void TtsControllerImpl::UpdateLanguageStatus(
    BrowserContext* browser_context,
    const std::string& lang,
    LanguageInstallStatus install_status,
    const std::string& error) {
  if (!update_language_status_delegates_.empty()) {
    for (auto& observer : update_language_status_delegates_) {
      observer.OnUpdateLanguageStatus(browser_context, lang, install_status,
                                      error);
    }
  }
}
// Registers |delegate| to be notified by UpdateLanguageStatus().
void TtsControllerImpl::AddUpdateLanguageStatusDelegate(
    UpdateLanguageStatusDelegate* delegate) {
  auto& delegates = update_language_status_delegates_;
  delegates.AddObserver(delegate);
}
// Unregisters |delegate| from UpdateLanguageStatus() notifications.
void TtsControllerImpl::RemoveUpdateLanguageStatusDelegate(
    UpdateLanguageStatusDelegate* delegate) {
  auto& delegates = update_language_status_delegates_;
  delegates.RemoveObserver(delegate);
}
// Notifies the voices-changed delegates and, if nothing is being spoken,
// starts the next queued utterance. Does nothing while the platform is still
// loading or when no delegate is registered.
void TtsControllerImpl::VoicesChanged() {
  if (voices_changed_delegates_.empty()) {
    return;
  }
  if (TtsPlatformLoading()) {
    return;
  }

  // Existence of platform tts indicates explicit requests to tts. Since
  // |VoicesChanged| can occur implicitly, only send if needed.
  for (auto& observer : voices_changed_delegates_) {
    observer.OnVoicesChanged();
  }

  // NOTE(review): the early returns above also skip this queue flush when no
  // delegate is registered — confirm that is intended.
  const bool idle_with_backlog =
      !current_utterance_ && !utterance_list_.empty();
  if (idle_with_backlog) {
    SpeakNextUtterance();
  }
}
// Registers |delegate| to be notified by VoicesChanged().
void TtsControllerImpl::AddVoicesChangedDelegate(
    VoicesChangedDelegate* delegate) {
  auto& delegates = voices_changed_delegates_;
  delegates.AddObserver(delegate);
}
// Unregisters |delegate| from VoicesChanged() notifications.
void TtsControllerImpl::RemoveVoicesChangedDelegate(
    VoicesChangedDelegate* delegate) {
  auto& delegates = voices_changed_delegates_;
  delegates.RemoveObserver(delegate);
}
// Drops every queued utterance whose event delegate is |delegate|. If the
// current utterance uses that delegate, detaches it, stops speech and moves
// on to the next queued utterance.
void TtsControllerImpl::RemoveUtteranceEventDelegate(
    UtteranceEventDelegate* delegate) {
  // Rebuild the queue, keeping only utterances owned by other delegates.
  std::list<std::unique_ptr<TtsUtterance>> pending;
  pending.swap(utterance_list_);
  while (!pending.empty()) {
    std::unique_ptr<TtsUtterance> candidate = std::move(pending.front());
    pending.pop_front();
    if (candidate->GetEventDelegate() == delegate) {
      continue;  // Dropped: destroyed at the end of this iteration.
    }
    utterance_list_.emplace_back(std::move(candidate));
  }

  if (!current_utterance_ ||
      current_utterance_->GetEventDelegate() != delegate) {
    return;
  }

  current_utterance_->SetEventDelegate(nullptr);
  // NOTE(review): unlike StopCurrentUtterance(), this does not consult
  // IsUtteranceSpokenByRemoteEngine() — confirm the difference is intended.
  const bool stop_via_engine =
      engine_delegate_ && !current_utterance_->GetEngineId().empty();
  if (stop_via_engine) {
    engine_delegate_->Stop(current_utterance_.get());
  } else {
    DCHECK(TtsPlatformReady());
    GetTtsPlatform()->ClearError();
    GetTtsPlatform()->StopSpeaking();
  }
  FinishCurrentUtterance();
  SpeakNextUtterance();
}
// Stores the engine delegate pointer; the controller does not delete it.
void TtsControllerImpl::SetTtsEngineDelegate(TtsEngineDelegate* delegate) {
  this->engine_delegate_ = delegate;
}
// Returns the engine delegate set via SetTtsEngineDelegate(), if any.
TtsEngineDelegate* TtsControllerImpl::GetTtsEngineDelegate() {
  return this->engine_delegate_;
}
void TtsControllerImpl::RefreshVoices() {
GetTtsPlatform()->RefreshVoices();
}
// Shuts down the TTS platform if one has been obtained; never creates one.
void TtsControllerImpl::Shutdown() {
  if (!tts_platform_) {
    return;
  }
  tts_platform_->Shutdown();
}
// Detaches |browser_context| from every utterance that references it (queued
// or current) and, if any did, schedules a full stop.
void TtsControllerImpl::OnBrowserContextDestroyed(
    BrowserContext* browser_context) {
  bool detached_any = false;
  const auto detach_if_owned = [&](TtsUtterance* candidate) {
    if (candidate->GetBrowserContext() == browser_context) {
      candidate->ClearBrowserContext();
      detached_any = true;
    }
  };

  for (const std::unique_ptr<TtsUtterance>& queued : utterance_list_) {
    detach_if_owned(queued.get());
  }
  if (current_utterance_) {
    detach_if_owned(current_utterance_.get());
  }

  if (!detached_any) {
    return;
  }

  // Stop speech just to be safe. This goes through PostTask because calling
  // Stop might try to send notifications, and that can trigger code paths
  // that access the BrowserContext that's being deleted. base::Unretained is
  // safe because this is a singleton.
  base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
      FROM_HERE, base::BindOnce(&TtsControllerImpl::StopAndClearQueue,
                                base::Unretained(this), GURL()));
}
// Overrides the platform returned by GetTtsPlatform().
void TtsControllerImpl::SetTtsPlatform(TtsPlatform* tts_platform) {
  this->tts_platform_ = tts_platform;
}
// Number of utterances waiting in the queue (the current one is not counted).
int TtsControllerImpl::QueueSize() {
  const auto pending = utterance_list_.size();
  return static_cast<int>(pending);
}
// Returns the TTS platform, fetching TtsPlatform::GetInstance() on first use
// unless one was already supplied through SetTtsPlatform().
TtsPlatform* TtsControllerImpl::GetTtsPlatform() {
  if (!tts_platform_) {
    tts_platform_ = TtsPlatform::GetInstance();
  }
  DCHECK(tts_platform_);
  return tts_platform_;
}
bool TtsControllerImpl::TtsPlatformReady() {
TtsPlatform* tts_platform = GetTtsPlatform();
return tts_platform->PlatformImplSupported() &&
tts_platform->PlatformImplInitialized();
}
bool TtsControllerImpl::TtsPlatformLoading() {
// If the platform implementation is supported, it is considered to be in
// loading state until the platform is inititialized. Typically, that means
// the libraries are loaded and the voices are being loaded.
TtsPlatform* tts_platform = GetTtsPlatform();
return tts_platform->PlatformImplSupported() &&
!tts_platform->PlatformImplInitialized();
}
void TtsControllerImpl::SpeakNow(std::unique_ptr<TtsUtterance> utterance) {
// Get all available voices and try to find a matching voice.
std::vector<VoiceData> voices;
GetVoices(utterance->GetBrowserContext(), utterance->GetSrcUrl(), &voices);
// Get the best matching voice. If nothing matches, just set "native"
// to true because that might trigger deferred loading of native voices.
// TODO(katie): Move most of the GetMatchingVoice logic into content/ and
// use the TTS controller delegate to get chrome-specific info as needed.
int index = GetMatchingVoice(utterance.get(), voices);
VoiceData voice;
if (index >= 0) {
voice = voices[index];
} else {
voice.native = true;
voice.engine_id = utterance->GetEngineId();
voice.name = utterance->GetVoiceName();
voice.lang = utterance->GetLang();
}
UpdateUtteranceDefaults(utterance.get());
GetTtsPlatform()->WillSpeakUtteranceWithVoice(utterance.get(), voice);
base::RecordAction(base::UserMetricsAction("TextToSpeech.Speak"));
UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.Rate",
utterance->GetContinuousParameters().rate);
UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.TextLength",
utterance->GetText().size());
UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.FromExtensionAPI",
!utterance->GetSrcUrl().is_empty());
UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVoiceName",
!utterance->GetVoiceName().empty());
UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.Native", voice.native);
if (!voice.native) {
#if !BUILDFLAG(IS_ANDROID)
DCHECK(!voice.engine_id.empty());
SetCurrentUtterance(std::move(utterance));
current_utterance_->SetEngineId(voice.engine_id);
if (engine_delegate_) {
engine_delegate_->Speak(current_utterance_.get(), voice);
}
bool sends_end_event =
voice.events.find(TTS_EVENT_END) != voice.events.end();
if (!sends_end_event) {
current_utterance_->Finish();
SetCurrentUtterance(nullptr);
SpeakNextUtterance();
}
#endif // !BUILDFLAG(IS_ANDROID)
} else {
// It's possible for certain platforms to send start events immediately
// during |speak|.
SetCurrentUtterance(std::move(utterance));
if (TtsPlatformReady()) {
GetTtsPlatform()->ClearError();
GetTtsPlatform()->Speak(
current_utterance_->GetId(), current_utterance_->GetText(),
current_utterance_->GetLang(), voice,
current_utterance_->GetContinuousParameters(),
base::BindOnce(&TtsControllerImpl::OnSpeakFinished,
base::Unretained(this), current_utterance_->GetId()));
} else {
// The TTS platform is not supported.
OnSpeakFinished(current_utterance_->GetId(), false);
}
}
}
// Completion callback for a platform Speak() call. Successful calls need no
// action; a failure is reported to the utterance as an error and the
// utterance is released.
void TtsControllerImpl::OnSpeakFinished(int utterance_id, bool success) {
  if (success) {
    return;
  }

  // This can run asynchronously, so the current utterance may have changed
  // in the meantime. Ignore such spurious callbacks.
  const bool still_current =
      current_utterance_ && current_utterance_->GetId() == utterance_id;
  if (!still_current) {
    return;
  }

  // If the native voice wasn't able to process this speech, see if the browser
  // has built-in TTS that crashed and needs re-loading or the utterance came
  // from a profile that no longer exists e.g. login.
  // The controller only ends up here if we had at some point completely
  // initialized native tts and tts engine delegate (see SpeakOrEnqueue), so
  // drop the utterance from re-processing.
  GetTtsPlatform()->LoadBuiltInTtsEngine(
      current_utterance_->GetBrowserContext());
  current_utterance_->OnTtsEvent(TTS_EVENT_ERROR, kInvalidCharIndex,
                                 kInvalidLength, GetTtsPlatform()->GetError());
  SetCurrentUtterance(nullptr);
}
// Empties the pending queue front to back. With |send_events| each utterance
// receives a CANCELLED event; without it each one is simply finished.
void TtsControllerImpl::ClearUtteranceQueue(bool send_events) {
  while (!utterance_list_.empty()) {
    std::unique_ptr<TtsUtterance> dropped = std::move(utterance_list_.front());
    utterance_list_.pop_front();
    if (!send_events) {
      dropped->Finish();
      continue;
    }
    dropped->OnTtsEvent(TTS_EVENT_CANCELLED, kInvalidCharIndex, kInvalidLength,
                        std::string());
  }
}
// Releases the current utterance, first sending it an INTERRUPTED event if it
// has not already reached a finished state. No-op without a current
// utterance.
void TtsControllerImpl::FinishCurrentUtterance() {
  if (!current_utterance_) {
    return;
  }
  const bool already_finished = current_utterance_->IsFinished();
  if (!already_finished) {
    current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
                                   kInvalidLength, std::string());
  }
  SetCurrentUtterance(nullptr);
}
void TtsControllerImpl::SpeakNextUtterance() {
if (paused_)
return;
// Start speaking the next utterance in the queue. Keep trying in case
// one fails but there are still more in the queue to try.
TtsUtterance* previous_utterance = nullptr;
while (!utterance_list_.empty() && !current_utterance_) {
std::unique_ptr<TtsUtterance> utterance =
std::move(utterance_list_.front());
utterance_list_.pop_front();
DCHECK(previous_utterance != utterance.get());
if (ShouldSpeakUtterance(utterance.get()))
SpeakNow(std::move(utterance));
else
utterance->Finish();
previous_utterance = utterance.get();
}
}
// Resolves the utterance's rate, pitch and volume and writes them back. On
// ChromeOS the controller delegate (if any) updates them from prefs;
// elsewhere each parameter still marked "not set" gets the Blink default.
void TtsControllerImpl::UpdateUtteranceDefaults(TtsUtterance* utterance) {
  const auto& params = utterance->GetContinuousParameters();
  double rate = params.rate;
  double pitch = params.pitch;
  double volume = params.volume;

#if BUILDFLAG(IS_CHROMEOS)
  if (TtsControllerDelegate* const prefs_delegate =
          GetTtsControllerDelegate()) {
    prefs_delegate->UpdateUtteranceDefaultsFromPrefs(utterance, &rate, &pitch,
                                                     &volume);
  }
#else
  // Update pitch, rate and volume to defaults if not explicitly set on this
  // utterance.
  const auto or_default = [](double value, double fallback) {
    return value == blink::mojom::kSpeechSynthesisDoublePrefNotSet ? fallback
                                                                   : value;
  };
  rate = or_default(rate, blink::mojom::kSpeechSynthesisDefaultRate);
  pitch = or_default(pitch, blink::mojom::kSpeechSynthesisDefaultPitch);
  volume = or_default(volume, blink::mojom::kSpeechSynthesisDefaultVolume);
#endif  // BUILDFLAG(IS_CHROMEOS)

  utterance->SetContinuousParameters(rate, pitch, volume);
}
void TtsControllerImpl::StripSSML(
const std::string& utterance,
base::OnceCallback<void(const std::string&)> on_ssml_parsed) {
// Skip parsing and return if not xml.
if (utterance.find("<?xml") == std::string::npos) {
std::move(on_ssml_parsed).Run(utterance);
return;
}
// Parse using safe, out-of-process Xml Parser.
data_decoder::DataDecoder::ParseXmlIsolated(
utterance,
data_decoder::mojom::XmlParser::WhitespaceBehavior::kPreserveSignificant,
base::BindOnce(&TtsControllerImpl::StripSSMLHelper, utterance,
std::move(on_ssml_parsed)));
}
// Completion handler for the XML parse started by StripSSML(). Runs
// |on_ssml_parsed| with the text collected from the parsed tree, or with the
// original |utterance| when parsing failed or the root element is not
// <speak>.
void TtsControllerImpl::StripSSMLHelper(
    const std::string& utterance,
    base::OnceCallback<void(const std::string&)> on_ssml_parsed,
    data_decoder::DataDecoder::ValueOrError result) {
  bool is_speak_document = false;
  if (result.has_value()) {
    std::string root_tag_name;
    data_decoder::GetXmlElementTagName(*result, &root_tag_name);
    is_speak_document = root_tag_name == "speak";
  }

  // Invalid XML or an unexpected root: fall back to the original text.
  if (!is_speak_document) {
    std::move(on_ssml_parsed).Run(utterance);
    return;
  }

  std::string stripped_text;
  // A raw base::Value pointer is passed so the helper can recurse.
  PopulateParsedText(&stripped_text, &*result);
  std::move(on_ssml_parsed).Run(stripped_text);
}
// Appends the text of |element| and, recursively, of all its children to
// |parsed_text|. Null or non-dictionary elements contribute nothing.
void TtsControllerImpl::PopulateParsedText(std::string* parsed_text,
                                           const base::Value* element) {
  DCHECK(parsed_text);
  if (!element) {
    return;
  }
  if (!element->is_dict()) {
    return;
  }

  // The element's own text is read directly rather than through
  // data_decoder::GetXmlElementText, which gets the text of the element's
  // first child instead of the current element.
  if (const std::string* own_text = element->GetDict().FindString(
          data_decoder::mojom::XmlParser::kTextKey)) {
    parsed_text->append(*own_text);
  }

  const base::Value::List* const children =
      data_decoder::GetXmlElementChildren(*element);
  if (!children) {
    return;
  }
  // Every child is visited because some text elements are nested within
  // other types of elements, such as <emphasis> tags.
  for (const auto& child : *children) {
    PopulateParsedText(parsed_text, &child);
  }
}
// Returns the index in |voices| of the voice that best suits |utterance|, or
// -1 when no voice qualifies. A requested engine id or voice name is a hard
// filter. The remaining voices are scored, and the first voice with the
// highest score wins:
//   128 / 64  utterance language (full locale match / language only)
//   32        voice supports every required event type
//   16, 8, 4  ChromeOS user-preferred voice (utterance language, system
//             locale, any locale)
//   2 / 1     application locale (exact / language only)
int TtsControllerImpl::GetMatchingVoice(TtsUtterance* utterance,
                                        const std::vector<VoiceData>& voices) {
  const std::string app_lang =
      GetContentClient()->browser()->GetApplicationLocale();

  // The best score starts at -1 so that a voice scoring 0 is still returned
  // when nothing better exists.
  int best_score = -1;
  int best_score_index = -1;

#if BUILDFLAG(IS_CHROMEOS)
  TtsControllerDelegate* delegate = GetTtsControllerDelegate();
  std::unique_ptr<TtsControllerDelegate::PreferredVoiceIds> preferred_ids =
      delegate ? delegate->GetPreferredVoiceIdsForUtterance(utterance)
               : nullptr;
#endif  // BUILDFLAG(IS_CHROMEOS)

  for (size_t i = 0; i < voices.size(); ++i) {
    const content::VoiceData& voice = voices[i];

    // A specified extension ID must match exactly.
    const bool engine_mismatch = !utterance->GetEngineId().empty() &&
                                 utterance->GetEngineId() != voice.engine_id;
    if (engine_mismatch) {
      continue;
    }

    // A specified voice name must match exactly.
    const bool name_mismatch = !utterance->GetVoiceName().empty() &&
                               voice.name != utterance->GetVoiceName();
    if (name_mismatch) {
      continue;
    }

    int score = 0;

    // Prefer the utterance language.
    if (!voice.lang.empty() && !utterance->GetLang().empty()) {
      // Locales are lowercased to handle cases like "en-us" vs. "en-US".
      const std::string voice_language =
          base::ToLowerASCII(l10n_util::GetLanguage(voice.lang));
      const std::string voice_country =
          base::ToLowerASCII(l10n_util::GetCountry(voice.lang));
      const std::string utterance_language =
          base::ToLowerASCII(l10n_util::GetLanguage(utterance->GetLang()));
      const std::string utterance_country =
          base::ToLowerASCII(l10n_util::GetCountry(utterance->GetLang()));

      const bool same_language = voice_language == utterance_language;
      // An exact locale match is worth more than a partial match. A locale
      // with no country matches one whose country equals its language.
      const bool same_country =
          voice_country == utterance_country ||
          (utterance_country.empty() && voice_language == voice_country) ||
          (voice_country.empty() && utterance_language == utterance_country);

      if (same_language && same_country) {
        score += 128;
      } else if (same_language) {
        score += 64;
      }
    }

    // Next, prefer required event types.
    const auto& required_events = utterance->GetRequiredEventTypes();
    if (!required_events.empty()) {
      const bool supports_all = std::all_of(
          required_events.begin(), required_events.end(),
          [&voice](TtsEventType event_type) {
            return voice.events.find(event_type) != voice.events.end();
          });
      if (supports_all) {
        score += 32;
      }
    }

#if BUILDFLAG(IS_CHROMEOS)
    if (preferred_ids) {
      // First prefer the user's preference voice for the utterance language,
      // if the utterance language is specified.
      if (!utterance->GetLang().empty() &&
          VoiceIdMatches(preferred_ids->lang_voice_id, voice)) {
        score += 16;
      }
      // Then prefer the user's preference voice for the system language.
      // This is a lower priority match than the utterance voice.
      if (VoiceIdMatches(preferred_ids->locale_voice_id, voice)) {
        score += 8;
      }
      // Finally, prefer the user's preference voice for any language. This
      // will pick the default voice if there is no better match for the
      // current system language and utterance language.
      if (VoiceIdMatches(preferred_ids->any_locale_voice_id, voice)) {
        score += 4;
      }
    }
#endif  // BUILDFLAG(IS_CHROMEOS)

    // Finally, prefer system language.
    if (!voice.lang.empty()) {
      if (voice.lang == app_lang) {
        score += 2;
      } else if (base::EqualsCaseInsensitiveASCII(
                     l10n_util::GetLanguage(voice.lang),
                     l10n_util::GetLanguage(app_lang))) {
        score += 1;
      }
    }

    // Strictly greater: on a tie the earlier voice is kept.
    if (score > best_score) {
      best_score = score;
      best_score_index = static_cast<int>(i);
    }
  }

  return best_score_index;
}
void TtsControllerImpl::SetCurrentUtterance(
std::unique_ptr<TtsUtterance> utterance) {
current_utterance_ = std::move(utterance);
Observe(current_utterance_
? AsUtteranceImpl(current_utterance_.get())->GetWebContents()
: nullptr);
}
// Finishes and removes every queued utterance created by |wc|, stops the
// current utterance, and then starts the next remaining one.
//
// |wc| is the WebContents of the current utterance (which the inherited
// WebContentsObserver starts observing every time the utterance changes);
// this is called when it is destroyed or hidden. In the destroyed case this
// avoids starting an utterance that is very likely to be destroyed right
// away. It also avoids a subtle timing issue: a queued utterance that has
// already received WebContentsDestroyed() would not get the corresponding
// notification again if it were started afterwards.
void TtsControllerImpl::StopCurrentUtteranceAndRemoveUtterancesMatching(
    WebContents* wc) {
  DCHECK(wc);

  utterance_list_.remove_if(
      [wc](const std::unique_ptr<TtsUtterance>& queued) {
        TtsUtteranceImpl* const impl = AsUtteranceImpl(queued.get());
        if (impl->GetWebContents() != wc) {
          return false;
        }
        impl->Finish();
        return true;
      });

  // An empty URL matches any current utterance, so this always stops.
  const bool stopped = StopCurrentUtteranceIfMatches(GURL());
  DCHECK(stopped);
  SpeakNextUtterance();
}
void TtsControllerImpl::RemoveUtteranceAndStopIfNeeded(int utterance_id) {
for (std::list<std::unique_ptr<TtsUtterance>>::iterator it =
utterance_list_.begin();
it != utterance_list_.end(); ++it) {
if ((*it)->GetId() == utterance_id) {
TtsUtteranceImpl* utterance_impl = AsUtteranceImpl((*it).get());
utterance_impl->Finish();
utterance_list_.erase(it);
break;
}
}
const bool stopped = StopCurrentUtteranceIfMatches(utterance_id);
if (stopped)
SpeakNextUtterance();
}
// Stops the current utterance if its id equals |utterance_id|.
//
// Returns true when the current utterance was stopped, and false when there
// is no current utterance or its id differs. The paused state is cleared in
// every case.
bool TtsControllerImpl::StopCurrentUtteranceIfMatches(int utterance_id) {
  paused_ = false;
  // RemoveUtteranceAndStopIfNeeded() calls this even when the utterance was
  // only found in the queue, so there may be no current utterance here;
  // dereferencing it unconditionally would crash.
  if (!current_utterance_ || current_utterance_->GetId() != utterance_id) {
    return false;
  }
  StopCurrentUtterance();
  return true;
}
// Decides whether |utterance| may be spoken. Utterances that were not created
// with a WebContents, or that are flagged to always be spoken, are allowed.
// Otherwise the WebContents must still exist and, when
// |stop_speaking_when_hidden_| is set, must not be hidden.
bool TtsControllerImpl::ShouldSpeakUtterance(TtsUtterance* utterance) {
  TtsUtteranceImpl* const impl = AsUtteranceImpl(utterance);
  if (!impl->was_created_with_web_contents()) {
    return true;
  }
  if (impl->ShouldAlwaysBeSpoken()) {
    return true;
  }

  // The WebContents that created the utterance has been destroyed.
  WebContents* const contents = impl->GetWebContents();
  if (!contents) {
    return false;
  }

  // Visibility only matters when hidden pages are required to stay silent.
  if (!stop_speaking_when_hidden_) {
    return true;
  }
  return contents->GetVisibility() != Visibility::HIDDEN;
}
//
// WebContentsObserver
//
void TtsControllerImpl::WebContentsDestroyed() {
StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents());
}
// When the observed WebContents becomes hidden and hidden pages must stay
// silent, stop its current utterance and drop its queued ones.
void TtsControllerImpl::OnVisibilityChanged(Visibility visibility) {
  const bool must_stop =
      visibility == Visibility::HIDDEN && stop_speaking_when_hidden_;
  if (!must_stop) {
    return;
  }
  StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents());
}
// Updates |allow_remote_voices_| from the connection type: remote voices are
// allowed on unknown, ethernet, wifi and bluetooth connections, and
// disallowed on cellular connections and when there is no connection.
void TtsControllerImpl::OnNetworkChanged(
    net::NetworkChangeNotifier::ConnectionType type) {
  using ConnectionType = net::NetworkChangeNotifier::ConnectionType;
  switch (type) {
    // Non-cellular connections.
    case ConnectionType::CONNECTION_UNKNOWN:
    case ConnectionType::CONNECTION_ETHERNET:
    case ConnectionType::CONNECTION_WIFI:
    case ConnectionType::CONNECTION_BLUETOOTH:
      allow_remote_voices_ = true;
      break;
    // Cellular connections, plus the no-connection case.
    case ConnectionType::CONNECTION_2G:
    case ConnectionType::CONNECTION_3G:
    case ConnectionType::CONNECTION_4G:
    case ConnectionType::CONNECTION_NONE:
    case ConnectionType::CONNECTION_5G:
      allow_remote_voices_ = false;
      break;
  }
}
#if BUILDFLAG(IS_CHROMEOS)
// Returns the cached controller delegate, fetching it from the content
// browser client on first use. Returns null when no delegate is cached and
// the content client or its browser client is unavailable.
TtsControllerDelegate* TtsControllerImpl::GetTtsControllerDelegate() {
  if (!delegate_ && GetContentClient() && GetContentClient()->browser()) {
    delegate_ = GetContentClient()->browser()->GetTtsControllerDelegate();
  }
  return delegate_;
}
// Test hook: replaces the cached delegate returned by
// GetTtsControllerDelegate().
void TtsControllerImpl::SetTtsControllerDelegateForTesting(
    TtsControllerDelegate* delegate) {
  this->delegate_ = delegate;
}
#endif // BUILDFLAG(IS_CHROMEOS)
} // namespace content