| // Copyright 2018 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "content/browser/speech/tts_controller_impl.h" |
| |
| #include <stddef.h> |
| |
| #include <algorithm> |
| #include <string> |
| #include <vector> |
| |
| #include "base/containers/queue.h" |
| #include "base/functional/bind.h" |
| #include "base/json/json_reader.h" |
| #include "base/metrics/histogram_macros.h" |
| #include "base/metrics/user_metrics.h" |
| #include "base/observer_list.h" |
| #include "base/strings/string_util.h" |
| #include "base/task/single_thread_task_runner.h" |
| #include "base/values.h" |
| #include "build/build_config.h" |
| #include "content/browser/speech/tts_utterance_impl.h" |
| #include "content/public/browser/content_browser_client.h" |
| #include "content/public/browser/tts_utterance.h" |
| #include "content/public/browser/visibility.h" |
| #include "content/public/browser/web_contents.h" |
| #include "content/public/common/content_client.h" |
| #include "services/data_decoder/public/cpp/safe_xml_parser.h" |
| #include "services/data_decoder/public/mojom/xml_parser.mojom.h" |
| #include "third_party/blink/public/mojom/speech/speech_synthesis.mojom.h" |
| #include "ui/base/l10n/l10n_util.h" |
| |
| #if BUILDFLAG(IS_CHROMEOS) |
| #include "content/public/browser/tts_controller_delegate.h" |
| #endif |
| |
| namespace content { |
| namespace { |
| // A value to be used to indicate that there is no char index available. |
| const int kInvalidCharIndex = -1; |
| |
| // A value to be used to indicate that there is no length available. |
| const int kInvalidLength = -1; |
| |
| #if BUILDFLAG(IS_CHROMEOS) |
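| // Returns true if |voice| matches the user's preferred voice |id|. Native |
| // voices are matched by name alone (with an empty engine id); engine-backed |
| // voices must match both the voice name and the engine id. |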
| bool VoiceIdMatches( |
| const std::optional<TtsControllerDelegate::PreferredVoiceId>& id, |
| const content::VoiceData& voice) { |
| if (!id.has_value() || voice.name.empty() || |
| (voice.engine_id.empty() && !voice.native)) |
| return false; |
| if (voice.native) |
| return id->name == voice.name && id->id.empty(); |
| return id->name == voice.name && id->id == voice.engine_id; |
| } |
| #endif // BUILDFLAG(IS_CHROMEOS) |
| |
| TtsUtteranceImpl* AsUtteranceImpl(TtsUtterance* utterance) { |
| return static_cast<TtsUtteranceImpl*>(utterance); |
| } |
| |
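| // Returns true if |utterance| specifies an engine and is flagged as being |
| // spoken by a remote engine. Such utterances are not controlled through |
| // |engine_delegate_|. |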
| bool IsUtteranceSpokenByRemoteEngine(TtsUtterance* utterance) { |
| if (utterance && !utterance->GetEngineId().empty()) { |
| TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance); |
| return utterance_impl->spoken_by_remote_engine(); |
| } |
| return false; |
| } |
| |
| } // namespace |
| |
| // |
| // VoiceData |
| // |
| |
| VoiceData::VoiceData() : remote(false), native(false) {} |
| |
| VoiceData::VoiceData(const VoiceData& other) = default; |
| |
| VoiceData::~VoiceData() {} |
| |
| // |
| // TtsController |
| // |
| |
| TtsController* TtsController::GetInstance() { |
| return TtsControllerImpl::GetInstance(); |
| } |
| |
| void TtsController::SkipAddNetworkChangeObserverForTests(bool enabled) { |
| return TtsControllerImpl::SkipAddNetworkChangeObserverForTests(enabled); |
| } |
| |
| // IMPORTANT! |
| // These values are written to logs. Do not renumber or delete |
| // existing items; add new entries to the end of the list. |
| // LINT.IfChange(UMATextToSpeechEvent) |
| enum class UMATextToSpeechEvent { |
| START = 0, |
| END = 1, |
| WORD = 2, |
| SENTENCE = 3, |
| MARKER = 4, |
| INTERRUPTED = 5, |
| CANCELLED = 6, |
| SPEECH_ERROR = 7, |
| PAUSE = 8, |
| RESUME = 9, |
| |
| // This must always be the last enum. It's okay for its value to |
| // increase, but none of the other enum values may change. |
| COUNT |
| }; |
| // LINT.ThenChange(/tools/metrics/histograms/metadata/accessibility/enums.xml:TextToSpeechEvent) |
| |
| // |
| // TtsControllerImpl |
| // |
| |
| // static |
| bool TtsControllerImpl::skip_add_network_change_observer_for_tests_ = false; |
| |
| // static |
| TtsControllerImpl* TtsControllerImpl::GetInstance() { |
| return base::Singleton<TtsControllerImpl>::get(); |
| } |
| |
| // static |
| void TtsControllerImpl::SkipAddNetworkChangeObserverForTests(bool enabled) { |
| TtsControllerImpl::skip_add_network_change_observer_for_tests_ = enabled; |
| } |
| |
| void TtsControllerImpl::SetStopSpeakingWhenHidden(bool value) { |
| stop_speaking_when_hidden_ = value; |
| } |
| |
| TtsControllerImpl::TtsControllerImpl() { |
| if (!skip_add_network_change_observer_for_tests_) { |
| net::NetworkChangeNotifier::AddNetworkChangeObserver(this); |
| } |
| OnNetworkChanged(net::NetworkChangeNotifier::GetConnectionType()); |
| } |
| |
| TtsControllerImpl::~TtsControllerImpl() { |
| if (current_utterance_) { |
| current_utterance_->Finish(); |
| SetCurrentUtterance(nullptr); |
| } |
| |
| // Clear any queued utterances too. |
| ClearUtteranceQueue(false); // Don't send events. |
| |
| net::NetworkChangeNotifier::RemoveNetworkChangeObserver(this); |
| } |
| |
| void TtsControllerImpl::SpeakOrEnqueue( |
| std::unique_ptr<TtsUtterance> utterance) { |
| if (!ShouldSpeakUtterance(utterance.get())) { |
| utterance->Finish(); |
| return; |
| } |
| |
| // If the TTS platform or the TTS engine delegate is still loading or |
| // initializing, queue the utterance (flushing the queue first if the |
| // utterance requests it). Utterances can be sent either to the platform |
| // specific implementation or to the engine implementation, so all |
| // utterances are postponed until both the platform specific implementation |
| // and the built-in TTS engine are loaded, to avoid races where an utterance |
| // gets dropped unexpectedly. |
| if (TtsPlatformLoading() || |
| (engine_delegate_ && !engine_delegate_->IsBuiltInTtsEngineInitialized( |
| utterance->GetBrowserContext()))) { |
| GetTtsPlatform()->LoadBuiltInTtsEngine(utterance->GetBrowserContext()); |
| |
| if (utterance->GetShouldClearQueue()) |
| ClearUtteranceQueue(true); |
| |
| utterance_list_.emplace_back(std::move(utterance)); |
| return; |
| } |
| |
| // If we're paused and the new utterance wants to clear the queue, flush the |
| // queue but stay in the paused state; the new utterance will be spoken once |
| // we resume. |
| if (paused_ && utterance->GetShouldClearQueue()) { |
| Stop(); |
| utterance_list_.emplace_back(std::move(utterance)); |
| paused_ = true; |
| return; |
| } |
| |
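| // Enqueue if we're paused, or if something is already speaking and this |
| // utterance does not flush the queue; otherwise interrupt whatever is |
| // speaking or queued and start this utterance immediately. |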
| if (paused_ || (IsSpeaking() && !utterance->GetShouldClearQueue())) { |
| utterance_list_.emplace_back(std::move(utterance)); |
| } else { |
| Stop(); |
| SpeakNow(std::move(utterance)); |
| } |
| } |
| |
| void TtsControllerImpl::Stop() { |
| Stop(GURL()); |
| } |
| |
| void TtsControllerImpl::Stop(const GURL& source_url) { |
| StopAndClearQueue(source_url); |
| } |
| |
| void TtsControllerImpl::StopAndClearQueue(const GURL& source_url) { |
| if (StopCurrentUtteranceIfMatches(source_url)) |
| ClearUtteranceQueue(true); |
| } |
| |
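| // Returns true if the current utterance was stopped. An empty |source_url| |
| // stops unconditionally; otherwise the current utterance is only stopped if |
| // it came from the same origin as |source_url|. |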
| bool TtsControllerImpl::StopCurrentUtteranceIfMatches(const GURL& source_url) { |
| base::RecordAction(base::UserMetricsAction("TextToSpeech.Stop")); |
| |
| paused_ = false; |
| |
| if (!source_url.is_empty() && current_utterance_ && |
| current_utterance_->GetSrcUrl().DeprecatedGetOriginAsURL() != |
| source_url.DeprecatedGetOriginAsURL()) |
| return false; |
| |
| StopCurrentUtterance(); |
| return true; |
| } |
| |
| void TtsControllerImpl::StopCurrentUtterance() { |
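| // Route the stop request to whichever backend owns the current utterance: |
| // the engine delegate for locally handled engine utterances, or the native |
| // platform implementation otherwise. Utterances spoken by a remote engine |
| // are not stopped through |engine_delegate_|. |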
| bool spoken_by_remote_engine = |
| IsUtteranceSpokenByRemoteEngine(current_utterance_.get()); |
| if (engine_delegate_ && current_utterance_ && |
| !current_utterance_->GetEngineId().empty() && !spoken_by_remote_engine) { |
| engine_delegate_->Stop(current_utterance_.get()); |
| } else if (TtsPlatformReady()) { |
| GetTtsPlatform()->ClearError(); |
| GetTtsPlatform()->StopSpeaking(); |
| } |
| |
| if (current_utterance_) { |
| current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex, |
| kInvalidLength, std::string()); |
| } |
| |
| FinishCurrentUtterance(); |
| } |
| |
| void TtsControllerImpl::Pause() { |
| base::RecordAction(base::UserMetricsAction("TextToSpeech.Pause")); |
| |
| if (paused_) |
| return; |
| |
| paused_ = true; |
| bool spoken_by_remote_engine = |
| IsUtteranceSpokenByRemoteEngine(current_utterance_.get()); |
| if (engine_delegate_ && current_utterance_ && |
| !current_utterance_->GetEngineId().empty() && !spoken_by_remote_engine) { |
| engine_delegate_->Pause(current_utterance_.get()); |
| } else if (current_utterance_) { |
| DCHECK(TtsPlatformReady()); |
| GetTtsPlatform()->ClearError(); |
| GetTtsPlatform()->Pause(); |
| } |
| } |
| |
| void TtsControllerImpl::Resume() { |
| base::RecordAction(base::UserMetricsAction("TextToSpeech.Resume")); |
| |
| if (!paused_) |
| return; |
| |
| paused_ = false; |
| bool spoken_by_remote_engine = |
| IsUtteranceSpokenByRemoteEngine(current_utterance_.get()); |
| if (engine_delegate_ && current_utterance_ && |
| !current_utterance_->GetEngineId().empty() && !spoken_by_remote_engine) { |
| engine_delegate_->Resume(current_utterance_.get()); |
| } else if (current_utterance_) { |
| DCHECK(TtsPlatformReady()); |
| GetTtsPlatform()->ClearError(); |
| GetTtsPlatform()->Resume(); |
| } else { |
| SpeakNextUtterance(); |
| } |
| } |
| |
| void TtsControllerImpl::UninstallLanguageRequest( |
| content::BrowserContext* browser_context, |
| const std::string& lang, |
| const std::string& client_id, |
| int source, |
| bool uninstall_immediately) { |
| if (!engine_delegate_) { |
| return; |
| } |
| |
| engine_delegate_->UninstallLanguageRequest(browser_context, lang, client_id, |
| source, uninstall_immediately); |
| } |
| |
| void TtsControllerImpl::InstallLanguageRequest(BrowserContext* browser_context, |
| const std::string& lang, |
| const std::string& client_id, |
| int source) { |
| if (!engine_delegate_) { |
| return; |
| } |
| |
| engine_delegate_->InstallLanguageRequest(browser_context, lang, client_id, |
| source); |
| } |
| |
| void TtsControllerImpl::LanguageStatusRequest(BrowserContext* browser_context, |
| const std::string& lang, |
| const std::string& client_id, |
| int source) { |
| if (!engine_delegate_) { |
| return; |
| } |
| |
| engine_delegate_->LanguageStatusRequest(browser_context, lang, client_id, |
| source); |
| } |
| |
| void TtsControllerImpl::OnTtsEvent(int utterance_id, |
| TtsEventType event_type, |
| int char_index, |
| int length, |
| const std::string& error_message) { |
| // We may sometimes receive completion callbacks "late", after we've |
| // already finished the utterance (for example because another utterance |
| // interrupted or we got a call to Stop). This is normal and we can |
| // safely just ignore these events. |
| if (!current_utterance_ || utterance_id != current_utterance_->GetId()) { |
| return; |
| } |
| |
| UMATextToSpeechEvent metric; |
| switch (event_type) { |
| case TTS_EVENT_START: |
| metric = UMATextToSpeechEvent::START; |
| break; |
| case TTS_EVENT_END: |
| metric = UMATextToSpeechEvent::END; |
| break; |
| case TTS_EVENT_WORD: |
| metric = UMATextToSpeechEvent::WORD; |
| break; |
| case TTS_EVENT_SENTENCE: |
| metric = UMATextToSpeechEvent::SENTENCE; |
| break; |
| case TTS_EVENT_MARKER: |
| metric = UMATextToSpeechEvent::MARKER; |
| break; |
| case TTS_EVENT_INTERRUPTED: |
| metric = UMATextToSpeechEvent::INTERRUPTED; |
| break; |
| case TTS_EVENT_CANCELLED: |
| metric = UMATextToSpeechEvent::CANCELLED; |
| break; |
| case TTS_EVENT_ERROR: |
| metric = UMATextToSpeechEvent::SPEECH_ERROR; |
| break; |
| case TTS_EVENT_PAUSE: |
| metric = UMATextToSpeechEvent::PAUSE; |
| break; |
| case TTS_EVENT_RESUME: |
| metric = UMATextToSpeechEvent::RESUME; |
| break; |
| default: |
| NOTREACHED(); |
| } |
| UMA_HISTOGRAM_ENUMERATION("TextToSpeech.Event", metric, |
| UMATextToSpeechEvent::COUNT); |
| |
| current_utterance_->OnTtsEvent(event_type, char_index, length, error_message); |
| if (current_utterance_->IsFinished()) { |
| FinishCurrentUtterance(); |
| SpeakNextUtterance(); |
| } |
| } |
| |
| void TtsControllerImpl::OnTtsUtteranceBecameInvalid(int utterance_id) { |
| #if BUILDFLAG(IS_CHROMEOS) |
| // Handles the case where an utterance that originated from the standalone |
| // browser becomes invalid: stop it if it is currently being spoken and |
| // remove it from the queue. |
| RemoveUtteranceAndStopIfNeeded(utterance_id); |
| #else |
| NOTREACHED(); |
| #endif |
| } |
| |
| void TtsControllerImpl::GetVoices(BrowserContext* browser_context, |
| const GURL& source_url, |
| std::vector<VoiceData>* out_voices) { |
| // Call GetTtsPlatform() first to ensure the platform is created, so that |
| // engine_delegate_ can be set if necessary. |
| TtsPlatform* tts_platform = GetTtsPlatform(); |
| |
| DCHECK(tts_platform); |
| // Ensure we have all built-in voices loaded. This is a no-op if already |
| // loaded. |
| tts_platform->LoadBuiltInTtsEngine(browser_context); |
| if (TtsPlatformReady()) |
| tts_platform->GetVoices(out_voices); |
| |
| if (browser_context && engine_delegate_ && |
| engine_delegate_->IsBuiltInTtsEngineInitialized(browser_context)) { |
| engine_delegate_->GetVoices(browser_context, source_url, out_voices); |
| } |
| |
| tts_platform->FinalizeVoiceOrdering(*out_voices); |
| |
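| // Remote (network) voices are filtered out when they are not allowed, e.g. |
| // on cellular connections; see OnNetworkChanged(). |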
| if (!allow_remote_voices_) { |
| auto it = |
| std::remove_if(out_voices->begin(), out_voices->end(), |
| [](const VoiceData& voice) { return voice.remote; }); |
| out_voices->resize(it - out_voices->begin()); |
| } |
| } |
| |
| bool TtsControllerImpl::IsSpeaking() { |
| return current_utterance_ != nullptr || |
| (TtsPlatformReady() && GetTtsPlatform()->IsSpeaking()); |
| } |
| |
| void TtsControllerImpl::UpdateLanguageStatus( |
| BrowserContext* browser_context, |
| const std::string& lang, |
| LanguageInstallStatus install_status, |
| const std::string& error) { |
| if (update_language_status_delegates_.empty()) { |
| return; |
| } |
| |
| for (auto& delegate : update_language_status_delegates_) { |
| delegate.OnUpdateLanguageStatus(browser_context, lang, install_status, |
| error); |
| } |
| } |
| |
| void TtsControllerImpl::AddUpdateLanguageStatusDelegate( |
| UpdateLanguageStatusDelegate* delegate) { |
| update_language_status_delegates_.AddObserver(delegate); |
| } |
| |
| void TtsControllerImpl::RemoveUpdateLanguageStatusDelegate( |
| UpdateLanguageStatusDelegate* delegate) { |
| update_language_status_delegates_.RemoveObserver(delegate); |
| } |
| |
| void TtsControllerImpl::VoicesChanged() { |
| if (voices_changed_delegates_.empty() || TtsPlatformLoading()) |
| return; |
| |
| // |VoicesChanged| can be triggered implicitly (e.g. while the platform is |
| // still loading its voices), so the checks above ensure delegates are only |
| // notified when there is something meaningful to report. |
| for (auto& delegate : voices_changed_delegates_) |
| delegate.OnVoicesChanged(); |
| |
| if (!current_utterance_ && !utterance_list_.empty()) |
| SpeakNextUtterance(); |
| } |
| |
| void TtsControllerImpl::AddVoicesChangedDelegate( |
| VoicesChangedDelegate* delegate) { |
| voices_changed_delegates_.AddObserver(delegate); |
| } |
| |
| void TtsControllerImpl::RemoveVoicesChangedDelegate( |
| VoicesChangedDelegate* delegate) { |
| voices_changed_delegates_.RemoveObserver(delegate); |
| } |
| |
| void TtsControllerImpl::RemoveUtteranceEventDelegate( |
| UtteranceEventDelegate* delegate) { |
| // First clear any pending utterances with this delegate. |
| std::list<std::unique_ptr<TtsUtterance>> old_list; |
| utterance_list_.swap(old_list); |
| while (!old_list.empty()) { |
| std::unique_ptr<TtsUtterance> utterance = std::move(old_list.front()); |
| old_list.pop_front(); |
| if (utterance->GetEventDelegate() != delegate) |
| utterance_list_.emplace_back(std::move(utterance)); |
| } |
| |
| if (current_utterance_ && |
| current_utterance_->GetEventDelegate() == delegate) { |
| current_utterance_->SetEventDelegate(nullptr); |
| if (engine_delegate_ && !current_utterance_->GetEngineId().empty()) { |
| engine_delegate_->Stop(current_utterance_.get()); |
| } else { |
| DCHECK(TtsPlatformReady()); |
| GetTtsPlatform()->ClearError(); |
| GetTtsPlatform()->StopSpeaking(); |
| } |
| |
| FinishCurrentUtterance(); |
| SpeakNextUtterance(); |
| } |
| } |
| |
| void TtsControllerImpl::SetTtsEngineDelegate(TtsEngineDelegate* delegate) { |
| engine_delegate_ = delegate; |
| } |
| |
| TtsEngineDelegate* TtsControllerImpl::GetTtsEngineDelegate() { |
| return engine_delegate_; |
| } |
| |
| void TtsControllerImpl::RefreshVoices() { |
| GetTtsPlatform()->RefreshVoices(); |
| } |
| |
| void TtsControllerImpl::Shutdown() { |
| if (tts_platform_) |
| tts_platform_->Shutdown(); |
| } |
| |
| void TtsControllerImpl::OnBrowserContextDestroyed( |
| BrowserContext* browser_context) { |
| bool did_clear_utterances = false; |
| |
| // First clear the BrowserContext from any utterances. |
| for (std::unique_ptr<TtsUtterance>& utterance : utterance_list_) { |
| if (utterance->GetBrowserContext() == browser_context) { |
| utterance->ClearBrowserContext(); |
| did_clear_utterances = true; |
| } |
| } |
| |
| if (current_utterance_ && |
| current_utterance_->GetBrowserContext() == browser_context) { |
| current_utterance_->ClearBrowserContext(); |
| did_clear_utterances = true; |
| } |
| |
| // If we cleared the BrowserContext from any utterances, stop speech |
| // just to be safe. Do this using PostTask because calling Stop might |
| // try to send notifications and that can trigger code paths that try |
| // to access the BrowserContext that's being deleted. Note that it's |
| // safe to use base::Unretained because this is a singleton. |
| if (did_clear_utterances) { |
| base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask( |
| FROM_HERE, base::BindOnce(&TtsControllerImpl::StopAndClearQueue, |
| base::Unretained(this), GURL())); |
| } |
| } |
| |
| void TtsControllerImpl::SetTtsPlatform(TtsPlatform* tts_platform) { |
| tts_platform_ = tts_platform; |
| } |
| |
| int TtsControllerImpl::QueueSize() { |
| return static_cast<int>(utterance_list_.size()); |
| } |
| |
| TtsPlatform* TtsControllerImpl::GetTtsPlatform() { |
| if (!tts_platform_) |
| tts_platform_ = TtsPlatform::GetInstance(); |
| DCHECK(tts_platform_); |
| return tts_platform_; |
| } |
| |
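| // The platform is ready once its implementation is both supported and fully |
| // initialized. |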
| bool TtsControllerImpl::TtsPlatformReady() { |
| TtsPlatform* tts_platform = GetTtsPlatform(); |
| return tts_platform->PlatformImplSupported() && |
| tts_platform->PlatformImplInitialized(); |
| } |
| |
| bool TtsControllerImpl::TtsPlatformLoading() { |
| // If the platform implementation is supported, it is considered to be in |
| // the loading state until the platform is initialized. Typically, that |
| // means the libraries have been loaded and the voices are being loaded. |
| TtsPlatform* tts_platform = GetTtsPlatform(); |
| return tts_platform->PlatformImplSupported() && |
| !tts_platform->PlatformImplInitialized(); |
| } |
| |
| void TtsControllerImpl::SpeakNow(std::unique_ptr<TtsUtterance> utterance) { |
| // Get all available voices and try to find a matching voice. |
| std::vector<VoiceData> voices; |
| GetVoices(utterance->GetBrowserContext(), utterance->GetSrcUrl(), &voices); |
| |
| // Get the best matching voice. If nothing matches, just set "native" |
| // to true because that might trigger deferred loading of native voices. |
| // TODO(katie): Move most of the GetMatchingVoice logic into content/ and |
| // use the TTS controller delegate to get chrome-specific info as needed. |
| int index = GetMatchingVoice(utterance.get(), voices); |
| VoiceData voice; |
| if (index >= 0) { |
| voice = voices[index]; |
| } else { |
| voice.native = true; |
| voice.engine_id = utterance->GetEngineId(); |
| voice.name = utterance->GetVoiceName(); |
| voice.lang = utterance->GetLang(); |
| } |
| |
| UpdateUtteranceDefaults(utterance.get()); |
| |
| GetTtsPlatform()->WillSpeakUtteranceWithVoice(utterance.get(), voice); |
| |
| base::RecordAction(base::UserMetricsAction("TextToSpeech.Speak")); |
| UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.Rate", |
| utterance->GetContinuousParameters().rate); |
| UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.TextLength", |
| utterance->GetText().size()); |
| UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.FromExtensionAPI", |
| !utterance->GetSrcUrl().is_empty()); |
| UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVoiceName", |
| !utterance->GetVoiceName().empty()); |
| UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.Native", voice.native); |
| |
| if (!voice.native) { |
| #if !BUILDFLAG(IS_ANDROID) |
| DCHECK(!voice.engine_id.empty()); |
| SetCurrentUtterance(std::move(utterance)); |
| current_utterance_->SetEngineId(voice.engine_id); |
| if (engine_delegate_) { |
| engine_delegate_->Speak(current_utterance_.get(), voice); |
| } |
| |
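| // If the engine's voice never reports an end event, there is no way to know |
| // when speech finishes, so mark the utterance finished now and move on to |
| // the next one in the queue. |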
| bool sends_end_event = |
| voice.events.find(TTS_EVENT_END) != voice.events.end(); |
| if (!sends_end_event) { |
| current_utterance_->Finish(); |
| SetCurrentUtterance(nullptr); |
| SpeakNextUtterance(); |
| } |
| #endif // !BUILDFLAG(IS_ANDROID) |
| } else { |
| // It's possible for certain platforms to send start events immediately |
| // during |speak|. |
| SetCurrentUtterance(std::move(utterance)); |
| if (TtsPlatformReady()) { |
| GetTtsPlatform()->ClearError(); |
| GetTtsPlatform()->Speak( |
| current_utterance_->GetId(), current_utterance_->GetText(), |
| current_utterance_->GetLang(), voice, |
| current_utterance_->GetContinuousParameters(), |
| base::BindOnce(&TtsControllerImpl::OnSpeakFinished, |
| base::Unretained(this), current_utterance_->GetId())); |
| } else { |
| // The TTS platform is not supported. |
| OnSpeakFinished(current_utterance_->GetId(), false); |
| } |
| } |
| } |
| |
| void TtsControllerImpl::OnSpeakFinished(int utterance_id, bool success) { |
| if (success) |
| return; |
| |
| // Since OnSpeakFinished could run asynchronously, it is possible that the |
| // current utterance has changed. Ignore any such spurious callbacks. |
| if (!current_utterance_ || current_utterance_->GetId() != utterance_id) |
| return; |
| |
| // If the native voice wasn't able to process this speech, check whether the |
| // browser has a built-in TTS engine that crashed and needs reloading, or |
| // whether the utterance came from a profile that no longer exists (e.g. the |
| // login profile). The controller only ends up here after the native TTS and |
| // the TTS engine delegate were fully initialized at some point (see |
| // SpeakOrEnqueue), so drop the utterance instead of re-processing it. |
| GetTtsPlatform()->LoadBuiltInTtsEngine( |
| current_utterance_->GetBrowserContext()); |
| |
| current_utterance_->OnTtsEvent(TTS_EVENT_ERROR, kInvalidCharIndex, |
| kInvalidLength, GetTtsPlatform()->GetError()); |
| SetCurrentUtterance(nullptr); |
| } |
| |
| void TtsControllerImpl::ClearUtteranceQueue(bool send_events) { |
| while (!utterance_list_.empty()) { |
| std::unique_ptr<TtsUtterance> utterance = |
| std::move(utterance_list_.front()); |
| utterance_list_.pop_front(); |
| if (send_events) { |
| utterance->OnTtsEvent(TTS_EVENT_CANCELLED, kInvalidCharIndex, |
| kInvalidLength, std::string()); |
| } else { |
| utterance->Finish(); |
| } |
| } |
| } |
| |
| void TtsControllerImpl::FinishCurrentUtterance() { |
| if (!current_utterance_) |
| return; |
| |
| if (!current_utterance_->IsFinished()) { |
| current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex, |
| kInvalidLength, std::string()); |
| } |
| |
| SetCurrentUtterance(nullptr); |
| } |
| |
| void TtsControllerImpl::SpeakNextUtterance() { |
| if (paused_) |
| return; |
| |
| // Start speaking the next utterance in the queue. Keep trying in case |
| // one fails but there are still more in the queue to try. |
| TtsUtterance* previous_utterance = nullptr; |
| while (!utterance_list_.empty() && !current_utterance_) { |
| std::unique_ptr<TtsUtterance> utterance = |
| std::move(utterance_list_.front()); |
| utterance_list_.pop_front(); |
| DCHECK(previous_utterance != utterance.get()); |
| |
| if (ShouldSpeakUtterance(utterance.get())) |
| SpeakNow(std::move(utterance)); |
| else |
| utterance->Finish(); |
| |
| previous_utterance = utterance.get(); |
| } |
| } |
| |
| void TtsControllerImpl::UpdateUtteranceDefaults(TtsUtterance* utterance) { |
| double rate = utterance->GetContinuousParameters().rate; |
| double pitch = utterance->GetContinuousParameters().pitch; |
| double volume = utterance->GetContinuousParameters().volume; |
| #if BUILDFLAG(IS_CHROMEOS) |
| if (GetTtsControllerDelegate()) |
| GetTtsControllerDelegate()->UpdateUtteranceDefaultsFromPrefs( |
| utterance, &rate, &pitch, &volume); |
| #else |
| // Update rate, pitch and volume to their defaults if not explicitly set on |
| // this utterance. |
| if (rate == blink::mojom::kSpeechSynthesisDoublePrefNotSet) |
| rate = blink::mojom::kSpeechSynthesisDefaultRate; |
| if (pitch == blink::mojom::kSpeechSynthesisDoublePrefNotSet) |
| pitch = blink::mojom::kSpeechSynthesisDefaultPitch; |
| if (volume == blink::mojom::kSpeechSynthesisDoublePrefNotSet) |
| volume = blink::mojom::kSpeechSynthesisDefaultVolume; |
| #endif // BUILDFLAG(IS_CHROMEOS) |
| utterance->SetContinuousParameters(rate, pitch, volume); |
| } |
| |
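| // Strips SSML markup from |utterance| and passes the extracted plain text |
| // to |on_ssml_parsed|. Input that is not XML is passed through unchanged. |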
| void TtsControllerImpl::StripSSML( |
| const std::string& utterance, |
| base::OnceCallback<void(const std::string&)> on_ssml_parsed) { |
| // Skip parsing and return the text unchanged if it is not XML. |
| if (utterance.find("<?xml") == std::string::npos) { |
| std::move(on_ssml_parsed).Run(utterance); |
| return; |
| } |
| |
| // Parse using the safe, out-of-process XML parser. |
| data_decoder::DataDecoder::ParseXmlIsolated( |
| utterance, |
| data_decoder::mojom::XmlParser::WhitespaceBehavior::kPreserveSignificant, |
| base::BindOnce(&TtsControllerImpl::StripSSMLHelper, utterance, |
| std::move(on_ssml_parsed))); |
| } |
| |
| // Called when XML parsing finishes. Uses the parsed XML to build the plain |
| // text of the utterance. |
| void TtsControllerImpl::StripSSMLHelper( |
| const std::string& utterance, |
| base::OnceCallback<void(const std::string&)> on_ssml_parsed, |
| data_decoder::DataDecoder::ValueOrError result) { |
| // If the XML was invalid, return the original utterance text. |
| if (!result.has_value()) { |
| std::move(on_ssml_parsed).Run(utterance); |
| return; |
| } |
| |
| std::string root_tag_name; |
| data_decoder::GetXmlElementTagName(*result, &root_tag_name); |
| // Root element must be <speak>. |
| if (root_tag_name.compare("speak") != 0) { |
| std::move(on_ssml_parsed).Run(utterance); |
| return; |
| } |
| |
| std::string parsed_text; |
| // Pass the parsed result as a base::Value* so the recursive helper can walk |
| // the element tree. |
| PopulateParsedText(&parsed_text, &*result); |
| |
| // Run with parsed_text. |
| std::move(on_ssml_parsed).Run(parsed_text); |
| } |
| |
| void TtsControllerImpl::PopulateParsedText(std::string* parsed_text, |
| const base::Value* element) { |
| DCHECK(parsed_text); |
| if (!element || !element->is_dict()) { |
| return; |
| } |
| // Add the element's text if present. |
| // Note: we don't use data_decoder::GetXmlElementText because it returns the |
| // text of the element's first child, not the text of the current element. |
| const std::string* text_value = |
| element->GetDict().FindString(data_decoder::mojom::XmlParser::kTextKey); |
| if (text_value) |
| *parsed_text += *text_value; |
| |
| const base::Value::List* children = |
| data_decoder::GetXmlElementChildren(*element); |
| if (!children) { |
| return; |
| } |
| |
| for (const auto& entry : *children) { |
| // We need to iterate over all children because some text elements are |
| // nested within other types of elements, such as <emphasis> tags. |
| PopulateParsedText(parsed_text, &entry); |
| } |
| } |
| |
| int TtsControllerImpl::GetMatchingVoice(TtsUtterance* utterance, |
| const std::vector<VoiceData>& voices) { |
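| // Voices are scored on several weighted criteria: locale match, required |
| // event types, user preferences (on ChromeOS), and the application locale. |
| // The weights are powers of two, so a higher-priority match always outranks |
| // any combination of lower-priority ones. |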
| const std::string app_lang = |
| GetContentClient()->browser()->GetApplicationLocale(); |
| // Start with a best score of -1 so that, even if none of the criteria |
| // match, some voice will still be returned as long as any voices exist. |
| int best_score = -1; |
| int best_score_index = -1; |
| #if BUILDFLAG(IS_CHROMEOS) |
| TtsControllerDelegate* delegate = GetTtsControllerDelegate(); |
| std::unique_ptr<TtsControllerDelegate::PreferredVoiceIds> preferred_ids = |
| delegate ? delegate->GetPreferredVoiceIdsForUtterance(utterance) |
| : nullptr; |
| #endif // BUILDFLAG(IS_CHROMEOS) |
| for (size_t i = 0; i < voices.size(); ++i) { |
| const content::VoiceData& voice = voices[i]; |
| int score = 0; |
| |
| // If the engine ID is specified, check for an exact match. |
| if (!utterance->GetEngineId().empty() && |
| utterance->GetEngineId() != voice.engine_id) |
| continue; |
| |
| // If the voice name is specified, check for an exact match. |
| if (!utterance->GetVoiceName().empty() && |
| voice.name != utterance->GetVoiceName()) |
| continue; |
| |
| // Prefer the utterance language. |
| if (!voice.lang.empty() && !utterance->GetLang().empty()) { |
| std::string voice_language = |
| base::ToLowerASCII(l10n_util::GetLanguage(voice.lang)); |
| std::string voice_country = |
| base::ToLowerASCII(l10n_util::GetCountry(voice.lang)); |
| std::string utterance_language = |
| base::ToLowerASCII(l10n_util::GetLanguage(utterance->GetLang())); |
| std::string utterance_country = |
| base::ToLowerASCII(l10n_util::GetCountry(utterance->GetLang())); |
| |
| // An exact locale match is worth more than a partial match. |
| // Convert locales to lowercase to handle cases like "en-us" vs. "en-US". |
| // Cases where language and country match should score the same as an |
| // exact match. |
| if (voice_language == utterance_language && |
| (voice_country == utterance_country || |
| (utterance_country.empty() && voice_language == voice_country) || |
| (voice_country.empty() && |
| utterance_language == utterance_country))) { |
| score += 128; |
| } else if (voice_language == utterance_language) { |
| score += 64; |
| } |
| } |
| |
| // Next, prefer required event types. |
| if (!utterance->GetRequiredEventTypes().empty()) { |
| bool has_all_required_event_types = true; |
| for (TtsEventType event_type : utterance->GetRequiredEventTypes()) { |
| if (voice.events.find(event_type) == voice.events.end()) { |
| has_all_required_event_types = false; |
| break; |
| } |
| } |
| if (has_all_required_event_types) |
| score += 32; |
| } |
| |
| #if BUILDFLAG(IS_CHROMEOS) |
| if (preferred_ids) { |
| // First prefer the user's preference voice for the utterance language, |
| // if the utterance language is specified. |
| if (!utterance->GetLang().empty() && |
| VoiceIdMatches(preferred_ids->lang_voice_id, voice)) { |
| score += 16; |
| } |
| |
| // Then prefer the user's preference voice for the system language. |
| // This is a lower priority match than the utterance voice. |
| if (VoiceIdMatches(preferred_ids->locale_voice_id, voice)) |
| score += 8; |
| |
| // Finally, prefer the user's preference voice for any language. This will |
| // pick the default voice if there is no better match for the current |
| // system language and utterance language. |
| if (VoiceIdMatches(preferred_ids->any_locale_voice_id, voice)) |
| score += 4; |
| } |
| #endif // BUILDFLAG(IS_CHROMEOS) |
| |
| // Finally, prefer system language. |
| if (!voice.lang.empty()) { |
| if (voice.lang == app_lang) { |
| score += 2; |
| } else if (base::EqualsCaseInsensitiveASCII( |
| l10n_util::GetLanguage(voice.lang), |
| l10n_util::GetLanguage(app_lang))) { |
| score += 1; |
| } |
| } |
| |
| if (score > best_score) { |
| best_score = score; |
| best_score_index = i; |
| } |
| } |
| |
| return best_score_index; |
| } |
| |
| void TtsControllerImpl::SetCurrentUtterance( |
| std::unique_ptr<TtsUtterance> utterance) { |
| current_utterance_ = std::move(utterance); |
| Observe(current_utterance_ |
| ? AsUtteranceImpl(current_utterance_.get())->GetWebContents() |
| : nullptr); |
| } |
| |
| void TtsControllerImpl::StopCurrentUtteranceAndRemoveUtterancesMatching( |
| WebContents* wc) { |
| DCHECK(wc); |
| // Removes any queued utterances associated with |wc| and stops the current |
| // utterance, whose WebContents our inherited WebContentsObserver starts |
| // observing every time the current utterance changes. |
| // |
| // This is called when the WebContents for the current utterance is destroyed |
| // or hidden. In the destroyed case, this avoids attempting to start an |
| // utterance that is very likely to be destroyed right away; there are also |
| // subtle timing issues if we didn't do this (if a queued utterance has |
| // already received WebContentsDestroyed() and we start it, we won't get the |
| // corresponding WebContentsDestroyed()). |
| auto eraser = [wc](const std::unique_ptr<TtsUtterance>& utterance) { |
| TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance.get()); |
| if (utterance_impl->GetWebContents() == wc) { |
| utterance_impl->Finish(); |
| return true; |
| } |
| return false; |
| }; |
| utterance_list_.erase( |
| std::remove_if(utterance_list_.begin(), utterance_list_.end(), eraser), |
| utterance_list_.end()); |
| const bool stopped = StopCurrentUtteranceIfMatches(GURL()); |
| DCHECK(stopped); |
| SpeakNextUtterance(); |
| } |
| |
| void TtsControllerImpl::RemoveUtteranceAndStopIfNeeded(int utterance_id) { |
| for (std::list<std::unique_ptr<TtsUtterance>>::iterator it = |
| utterance_list_.begin(); |
| it != utterance_list_.end(); ++it) { |
| if ((*it)->GetId() == utterance_id) { |
| TtsUtteranceImpl* utterance_impl = AsUtteranceImpl((*it).get()); |
| utterance_impl->Finish(); |
| utterance_list_.erase(it); |
| break; |
| } |
| } |
| |
| const bool stopped = StopCurrentUtteranceIfMatches(utterance_id); |
| if (stopped) |
| SpeakNextUtterance(); |
| } |
| |
| bool TtsControllerImpl::StopCurrentUtteranceIfMatches(int utterance_id) { |
| paused_ = false; |
| |
| if (!current_utterance_ || current_utterance_->GetId() != utterance_id) |
| return false; |
| |
| StopCurrentUtterance(); |
| return true; |
| } |
| |
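| // Utterances not tied to a WebContents, or marked to always be spoken, are |
| // always allowed. Otherwise speech is gated on the WebContents still |
| // existing and, if |stop_speaking_when_hidden_| is set, on it not being |
| // hidden. |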
| bool TtsControllerImpl::ShouldSpeakUtterance(TtsUtterance* utterance) { |
| TtsUtteranceImpl* utterance_impl = AsUtteranceImpl(utterance); |
| if (!utterance_impl->was_created_with_web_contents() || |
| utterance_impl->ShouldAlwaysBeSpoken()) { |
| return true; |
| } |
| |
| // If the WebContents that created the utterance has been destroyed, don't |
| // speak it. |
| if (!utterance_impl->GetWebContents()) |
| return false; |
| |
| // Allow speaking if either the WebContents is visible, or the WebContents |
| // isn't required to be visible before speaking. |
| return !stop_speaking_when_hidden_ || |
| utterance_impl->GetWebContents()->GetVisibility() != |
| Visibility::HIDDEN; |
| } |
| |
| // |
| // WebContentsObserver |
| // |
| |
| void TtsControllerImpl::WebContentsDestroyed() { |
| StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents()); |
| } |
| |
| void TtsControllerImpl::OnVisibilityChanged(Visibility visibility) { |
| if (visibility == Visibility::HIDDEN && stop_speaking_when_hidden_) |
| StopCurrentUtteranceAndRemoveUtterancesMatching(web_contents()); |
| } |
| |
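| // Updates |allow_remote_voices_| whenever the connection type changes so |
| // that remote (network) voices are only offered on non-cellular connections. |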
| void TtsControllerImpl::OnNetworkChanged( |
| net::NetworkChangeNotifier::ConnectionType type) { |
| switch (type) { |
| // Non-cellular connections. |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_UNKNOWN: |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_ETHERNET: |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_WIFI: |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_BLUETOOTH: |
| allow_remote_voices_ = true; |
| break; |
| |
| // Cellular connections and no connection. |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_2G: |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_3G: |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_4G: |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_NONE: |
| case net::NetworkChangeNotifier::ConnectionType::CONNECTION_5G: |
| allow_remote_voices_ = false; |
| } |
| } |
| |
| #if BUILDFLAG(IS_CHROMEOS) |
| TtsControllerDelegate* TtsControllerImpl::GetTtsControllerDelegate() { |
| if (delegate_) |
| return delegate_; |
| if (GetContentClient() && GetContentClient()->browser()) { |
| delegate_ = GetContentClient()->browser()->GetTtsControllerDelegate(); |
| return delegate_; |
| } |
| return nullptr; |
| } |
| |
| void TtsControllerImpl::SetTtsControllerDelegateForTesting( |
| TtsControllerDelegate* delegate) { |
| delegate_ = delegate; |
| } |
| #endif // BUILDFLAG(IS_CHROMEOS) |
| |
| } // namespace content |