// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognition_manager_impl.h"

#include <algorithm>
#include <map>
#include <optional>
#include <utility>
#include <vector>

#include "base/functional/bind.h"
#include "base/functional/callback.h"
#include "base/location.h"
#include "base/memory/ref_counted_delete_on_sequence.h"
#include "base/metrics/histogram_functions.h"
#include "base/task/sequenced_task_runner.h"
#include "base/task/single_thread_task_runner.h"
#include "build/build_config.h"
#include "components/soda/soda_util.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/renderer_host/media/media_stream_manager.h"
#include "content/browser/renderer_host/media/media_stream_ui_proxy.h"
#include "content/browser/speech/network_speech_recognition_engine_impl.h"
#include "content/browser/speech/speech_recognizer_impl.h"
#include "content/public/browser/browser_task_traits.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/content_browser_client.h"
#include "content/public/browser/document_user_data.h"
#include "content/public/browser/render_frame_host.h"
#include "content/public/browser/resource_context.h"
#include "content/public/browser/speech_recognition_audio_forwarder_config.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "content/public/browser/speech_recognition_manager_delegate.h"
#include "content/public/browser/speech_recognition_session_config.h"
#include "content/public/browser/speech_recognition_session_context.h"
#include "content/public/browser/web_contents.h"
#include "content/public/browser/web_contents_observer.h"
#include "content/public/common/content_client.h"
#include "media/audio/audio_device_description.h"
#include "media/base/limits.h"
#include "media/mojo/mojom/speech_recognition.mojom.h"
#include "media/mojo/mojom/speech_recognition_audio_forwarder.mojom.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "media/mojo/mojom/speech_recognition_result.mojom.h"
#include "media/mojo/mojom/speech_recognizer.mojom.h"
#include "mojo/public/cpp/bindings/pending_receiver.h"
#include "url/gurl.h"
#include "url/origin.h"

#if BUILDFLAG(IS_ANDROID)
#include "content/browser/speech/speech_recognizer_impl_android.h"
#elif !BUILDFLAG(IS_FUCHSIA)
#include "components/soda/constants.h"
#include "content/browser/speech/soda_speech_recognition_engine_impl.h"
#include "media/base/media_switches.h"
#endif  // BUILDFLAG(IS_ANDROID)

namespace content {

SpeechRecognitionManager* SpeechRecognitionManager::manager_for_tests_;

namespace {

SpeechRecognitionManagerImpl* g_speech_recognition_manager_impl;

constexpr char kWebSpeechAudioOnDeviceAvailableHistogram[] =
    "Accessibility.WebSpeech.OnDeviceAvailable";
constexpr char kWebSpeechAudioUseOnDeviceHistogram[] =
    "Accessibility.WebSpeech.UseOnDevice";
constexpr char kWebSpeechAudioUseAudioForwarderHistogram[] =
    "Accessibility.WebSpeech.UseAudioForwarder";
constexpr char kWebSpeechCanRenderFrameUseOnDeviceHistogram[] =
    "Accessibility.WebSpeech.CanRenderFrameUseOnDevice";
constexpr char kWebSpeechIsOnDeviceSpeechRecognitionInstalledHistogram[] =
    "Accessibility.WebSpeech.IsOnDeviceSpeechRecognitionInstalled";

}  // namespace

int SpeechRecognitionManagerImpl::next_requester_id_ = 0;

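// Tracks the speech recognition sessions started by a document. Lives on the
// UI thread for the lifetime of the document: when the document goes away,
// the destructor notifies the IO thread, via |frame_deleted_callback_|, about
// every session still associated with the document so that those sessions can
// be aborted.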
class FrameSessionTracker
    : public content::DocumentUserData<FrameSessionTracker> {
 public:
  using FrameDeletedCallback =
      base::RepeatingCallback<void(int /* session_id */)>;

  ~FrameSessionTracker() override {
    DCHECK_CURRENTLY_ON(BrowserThread::UI);

    for (auto session : sessions_) {
      GetIOThreadTaskRunner({})->PostTask(
          FROM_HERE, base::BindOnce(frame_deleted_callback_, session));
    }
  }

  static void CreateObserverForSession(int render_process_id,
                                       int render_frame_id,
                                       int session_id,
                                       FrameDeletedCallback callback) {
    DCHECK_CURRENTLY_ON(BrowserThread::UI);

    RenderFrameHost* render_frame_host =
        RenderFrameHost::FromID(render_process_id, render_frame_id);
    if (!render_frame_host)
      return;

    FrameSessionTracker* tracker =
        GetOrCreateForCurrentDocument(render_frame_host);

    // This clobbers any previously set callback, but the callback is always
    // the same binding, so doing it repeatedly is harmless.
    tracker->SetCallback(std::move(callback));
    tracker->AddSession(session_id);
  }

  static void RemoveObserverForSession(int render_process_id,
                                       int render_frame_id,
                                       int session_id) {
    DCHECK_CURRENTLY_ON(BrowserThread::UI);

    RenderFrameHost* render_frame_host =
        RenderFrameHost::FromID(render_process_id, render_frame_id);
    if (!render_frame_host)
      return;

    FrameSessionTracker* tracker = GetForCurrentDocument(render_frame_host);
    if (!tracker)
      return;
    tracker->RemoveSession(session_id);
  }

 private:
  explicit FrameSessionTracker(content::RenderFrameHost* rfh)
      : DocumentUserData<FrameSessionTracker>(rfh) {}

  friend class content::DocumentUserData<FrameSessionTracker>;
  DOCUMENT_USER_DATA_KEY_DECL();

  void AddSession(int session_id) { sessions_.insert(session_id); }

  void RemoveSession(int session_id) { sessions_.erase(session_id); }

  void SetCallback(FrameDeletedCallback callback) {
    frame_deleted_callback_ = std::move(callback);
  }

  FrameDeletedCallback frame_deleted_callback_;
  std::set<int> sessions_;
};

DOCUMENT_USER_DATA_KEY_IMPL(FrameSessionTracker);

SpeechRecognitionManager* SpeechRecognitionManager::GetInstance() {
  if (manager_for_tests_)
    return manager_for_tests_;
  return SpeechRecognitionManagerImpl::GetInstance();
}

void SpeechRecognitionManager::SetManagerForTesting(
    SpeechRecognitionManager* manager) {
  manager_for_tests_ = manager;
}

SpeechRecognitionManagerImpl* SpeechRecognitionManagerImpl::GetInstance() {
  return g_speech_recognition_manager_impl;
}

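// Returns true if an on-device (SODA) model for |config.language| is
// installed and ready to use. Always false on Android and Fuchsia, where the
// on-device engine is not available to this manager.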
bool SpeechRecognitionManagerImpl::IsOnDeviceSpeechRecognitionInstalled(
    const SpeechRecognitionSessionConfig& config) {
#if !BUILDFLAG(IS_FUCHSIA) && !BUILDFLAG(IS_ANDROID)
  return speech::IsOnDeviceSpeechRecognitionAvailable(config.language) ==
         media::mojom::AvailabilityStatus::kAvailable;
#else
  return false;
#endif  // !BUILDFLAG(IS_FUCHSIA) && !BUILDFLAG(IS_ANDROID)
}

SpeechRecognitionManagerImpl::SpeechRecognitionManagerImpl(
    media::AudioSystem* audio_system,
    MediaStreamManager* media_stream_manager)
    : audio_system_(audio_system),
      media_stream_manager_(media_stream_manager),
      delegate_(GetContentClient()
                    ->browser()
                    ->CreateSpeechRecognitionManagerDelegate()),
      requester_id_(next_requester_id_++) {
  DCHECK(!g_speech_recognition_manager_impl);
  g_speech_recognition_manager_impl = this;
}

SpeechRecognitionManagerImpl::~SpeechRecognitionManagerImpl() {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  DCHECK(g_speech_recognition_manager_impl);

  g_speech_recognition_manager_impl = nullptr;
}

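// Convenience overload for callers that use neither the Mojo session plumbing
// nor a forwarded audio stream. A minimal usage sketch (hypothetical caller;
// assumes |listener| implements SpeechRecognitionEventListener and exposes a
// weak pointer):
//
//   SpeechRecognitionSessionConfig config;
//   config.language = "en-US";
//   config.event_listener = listener->GetWeakPtr();
//   int session_id =
//       SpeechRecognitionManager::GetInstance()->CreateSession(config);
//   SpeechRecognitionManager::GetInstance()->StartSession(session_id);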
int SpeechRecognitionManagerImpl::CreateSession(
    const SpeechRecognitionSessionConfig& config) {
  return CreateSession(config, mojo::NullReceiver(), mojo::NullRemote(),
                       std::nullopt);
}

int SpeechRecognitionManagerImpl::CreateSession(
    const SpeechRecognitionSessionConfig& config,
    mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
        session_receiver,
    mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
        client_remote,
    std::optional<SpeechRecognitionAudioForwarderConfig>
        audio_forwarder_config) {
  // On-device speech recognition may only be used if the call site explicitly
  // checks whether the render frame is permitted to use it.
  return CreateSession(
      config, std::move(session_receiver), std::move(client_remote),
      audio_forwarder_config.has_value()
          ? std::make_optional<SpeechRecognitionAudioForwarderConfig>(
                audio_forwarder_config.value())
          : std::nullopt,
      /*can_render_frame_use_on_device=*/false);
}

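// Starts a previously created session. Microphone-backed sessions first go
// through the delegate's permission check (which may prompt the user);
// sessions fed by an audio forwarder dispatch EVENT_START directly.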
void SpeechRecognitionManagerImpl::StartSession(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (sessions_[session_id]->use_microphone) {
    // If there is another session using the microphone, abort that one.
    if (microphone_session_id_ != kSessionIDInvalid &&
        microphone_session_id_ != session_id) {
      AbortSession(microphone_session_id_);
    }

    microphone_session_id_ = session_id;

    if (delegate_) {
      delegate_->CheckRecognitionIsAllowed(
          session_id,
          base::BindOnce(
              &SpeechRecognitionManagerImpl::RecognitionAllowedCallback,
              weak_factory_.GetWeakPtr(), session_id));
    }
    return;
  }

  base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
      FROM_HERE,
      base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
                     weak_factory_.GetWeakPtr(), session_id, EVENT_START));
}

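// Receives the outcome of the delegate's permission check (and is re-invoked
// by MediaRequestPermissionCallback() once a prompt resolves). If |ask_user|
// is true, a media access request is issued to prompt the user; otherwise the
// session either starts (EVENT_START) or is aborted with a kNotAllowed error.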
void SpeechRecognitionManagerImpl::RecognitionAllowedCallback(int session_id,
                                                              bool ask_user,
                                                              bool is_allowed) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);

  auto iter = sessions_.find(session_id);
  if (iter == sessions_.end())
    return;

  Session* session = iter->second.get();

  if (session->abort_requested)
    return;

  if (ask_user) {
    SpeechRecognitionSessionContext& context = session->context;
    context.label = media_stream_manager_->MakeMediaAccessRequest(
        {context.render_process_id, context.render_frame_id}, requester_id_,
        session_id, blink::StreamControls(true, false),
        context.security_origin,
        base::BindOnce(
            &SpeechRecognitionManagerImpl::MediaRequestPermissionCallback,
            weak_factory_.GetWeakPtr(), session_id));
    return;
  }

  if (is_allowed) {
    base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
        FROM_HERE,
        base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
                       weak_factory_.GetWeakPtr(), session_id, EVENT_START));
  } else {
    OnRecognitionError(
        session_id, media::mojom::SpeechRecognitionError(
                        media::mojom::SpeechRecognitionErrorCode::kNotAllowed,
                        media::mojom::SpeechAudioErrorDetails::kNone));
    base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
        FROM_HERE,
        base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
                       weak_factory_.GetWeakPtr(), session_id, EVENT_ABORT));
  }
}

void SpeechRecognitionManagerImpl::MediaRequestPermissionCallback(
    int session_id,
    const blink::mojom::StreamDevicesSet& stream_devices_set,
    std::unique_ptr<MediaStreamUIProxy> stream_ui) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);

  auto iter = sessions_.find(session_id);
  if (iter == sessions_.end())
    return;

  // The SpeechRecognitionManager is not used with multiple streams, which are
  // only supported in combination with the getAllScreensMedia API. The
  // |stream_devices| vector can be empty, e.g. if the permission was denied.
  DCHECK_LE(stream_devices_set.stream_devices.size(), 1u);

  blink::MediaStreamDevices devices_list =
      blink::ToMediaStreamDevicesList(stream_devices_set);
  const bool is_allowed = !devices_list.empty();
  if (is_allowed) {
    // Copy the approved devices array to the context for UI indication.
    iter->second->context.devices = devices_list;

    // Save the UI object.
    iter->second->ui = std::move(stream_ui);
  }

  // Clear the label to indicate the request has been handled.
  iter->second->context.label.clear();

  // Notify the recognition about the request result.
  RecognitionAllowedCallback(iter->first, false, is_allowed);
}

void SpeechRecognitionManagerImpl::AbortSession(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  auto iter = sessions_.find(session_id);
  if (iter == sessions_.end())
    return;

  GetUIThreadTaskRunner({})->PostTask(
      FROM_HERE,
      base::BindOnce(&FrameSessionTracker::RemoveObserverForSession,
                     iter->second->config.initial_context.render_process_id,
                     iter->second->config.initial_context.render_frame_id,
                     session_id));

  AbortSessionImpl(session_id);
}

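// Aborts the session on the IO thread without notifying the UI-thread
// FrameSessionTracker. Also bound directly as the tracker's frame-deleted
// callback, so it tolerates session ids that have already been removed.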
void SpeechRecognitionManagerImpl::AbortSessionImpl(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);

  auto iter = sessions_.find(session_id);
  if (iter == sessions_.end())
    return;

  iter->second->ui.reset();

  if (iter->second->abort_requested)
    return;

  iter->second->abort_requested = true;

  base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
      FROM_HERE,
      base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
                     weak_factory_.GetWeakPtr(), session_id, EVENT_ABORT));
}

void SpeechRecognitionManagerImpl::StopAudioCaptureForSession(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);

  auto iter = sessions_.find(session_id);
  if (iter == sessions_.end())
    return;

  GetUIThreadTaskRunner({})->PostTask(
      FROM_HERE,
      base::BindOnce(&FrameSessionTracker::RemoveObserverForSession,
                     iter->second->config.initial_context.render_process_id,
                     iter->second->config.initial_context.render_frame_id,
                     session_id));

  iter->second->ui.reset();

  base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
      FROM_HERE, base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
                                weak_factory_.GetWeakPtr(), session_id,
                                EVENT_STOP_CAPTURE));
}

void SpeechRecognitionManagerImpl::UpdateRecognitionContextForSession(
    int session_id,
    const media::SpeechRecognitionRecognitionContext& recognition_context) {
  CHECK_CURRENTLY_ON(BrowserThread::IO);
  auto iter = sessions_.find(session_id);
  if (iter == sessions_.end()) {
    return;
  }
  iter->second->recognition_context = recognition_context;

  base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
      FROM_HERE, base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
                                weak_factory_.GetWeakPtr(), session_id,
                                EVENT_UPDATE_RECOGNITION_CONTEXT));
}

// Here begins the SpeechRecognitionEventListener interface implementation,
// which will simply relay the events to the proper listener registered for
// the particular session and to the catch-all listener provided by the
// delegate (if any).

void SpeechRecognitionManagerImpl::OnRecognitionStart(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  auto iter = sessions_.find(session_id);
  if (iter->second->ui) {
    // Notify the UI that the devices are being used.
    iter->second->ui->OnStarted(
        base::OnceClosure(), MediaStreamUI::SourceCallback(),
        MediaStreamUIProxy::WindowIdCallback(), /*label=*/std::string(),
        /*screen_capture_ids=*/{}, MediaStreamUI::StateChangeCallback());
  }

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnRecognitionStart(session_id);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnRecognitionStart(session_id);
}

void SpeechRecognitionManagerImpl::OnAudioStart(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnAudioStart(session_id);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnAudioStart(session_id);
}

void SpeechRecognitionManagerImpl::OnSoundStart(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnSoundStart(session_id);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnSoundStart(session_id);
}

void SpeechRecognitionManagerImpl::OnSoundEnd(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnSoundEnd(session_id);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnSoundEnd(session_id);
}

void SpeechRecognitionManagerImpl::OnAudioEnd(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnAudioEnd(session_id);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnAudioEnd(session_id);
  base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
      FROM_HERE, base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
                                weak_factory_.GetWeakPtr(), session_id,
                                EVENT_AUDIO_ENDED));
}

void SpeechRecognitionManagerImpl::OnRecognitionResults(
    int session_id,
    const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnRecognitionResults(session_id, results);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnRecognitionResults(session_id, results);
}

void SpeechRecognitionManagerImpl::OnRecognitionError(
    int session_id,
    const media::mojom::SpeechRecognitionError& error) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnRecognitionError(session_id, error);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnRecognitionError(session_id, error);
}

void SpeechRecognitionManagerImpl::OnAudioLevelsChange(
    int session_id, float volume, float noise_volume) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnAudioLevelsChange(session_id, volume, noise_volume);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnAudioLevelsChange(session_id, volume, noise_volume);
}

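// The full CreateSession() overload. Validates the config, records UMA
// metrics, and then picks one of three paths: a service-managed on-device
// session (forwarded audio; returns 0 because the browser does not track it),
// the on-device SODA engine, or the network speech recognition engine.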
int SpeechRecognitionManagerImpl::CreateSession(
    const SpeechRecognitionSessionConfig& config,
    mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
        session_receiver,
    mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
        client_remote,
    std::optional<SpeechRecognitionAudioForwarderConfig> audio_forwarder_config,
    bool can_render_frame_use_on_device) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  const int session_id = GetNextSessionID();
  DCHECK(!SessionExists(session_id));

  base::UmaHistogramBoolean(kWebSpeechAudioOnDeviceAvailableHistogram,
                            IsOnDeviceSpeechRecognitionInstalled(config));
  base::UmaHistogramBoolean(kWebSpeechAudioUseOnDeviceHistogram,
                            UseOnDeviceSpeechRecognition(config));
  base::UmaHistogramBoolean(kWebSpeechAudioUseAudioForwarderHistogram,
                            audio_forwarder_config.has_value());

  // Initialize the error to be none.
  media::mojom::SpeechRecognitionErrorCode error =
      media::mojom::SpeechRecognitionErrorCode::kNone;

  if (UseOnDeviceSpeechRecognition(config)) {
    base::UmaHistogramBoolean(kWebSpeechCanRenderFrameUseOnDeviceHistogram,
                              can_render_frame_use_on_device);
    if (!can_render_frame_use_on_device) {
      error = media::mojom::SpeechRecognitionErrorCode::kServiceNotAllowed;
    }

    bool is_on_device_speech_recognition_installed =
        IsOnDeviceSpeechRecognitionInstalled(config);
    base::UmaHistogramBoolean(
        kWebSpeechIsOnDeviceSpeechRecognitionInstalledHistogram,
        is_on_device_speech_recognition_installed);
    // Set the error if on-device speech recognition must be used but is not
    // available.
    if (!is_on_device_speech_recognition_installed) {
      error = media::mojom::SpeechRecognitionErrorCode::kLanguageNotSupported;
    }
  } else {
    // Set the error if on-device speech recognition is not used but a
    // recognition context is set.
    if (config.recognition_context.has_value()) {
      error = media::mojom::SpeechRecognitionErrorCode::kPhrasesNotSupported;
    }
  }

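  // Reject forwarded audio whose sample rate or channel count falls outside
  // the ranges supported by the media pipeline (media::limits).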
  if (audio_forwarder_config.has_value() &&
      (audio_forwarder_config.value().sample_rate >
           media::limits::kMaxSampleRate ||
       audio_forwarder_config.value().sample_rate <
           media::limits::kMinSampleRate ||
       audio_forwarder_config.value().channel_count <= 0 ||
       audio_forwarder_config.value().channel_count >
           media::limits::kMaxChannels)) {
    error = media::mojom::SpeechRecognitionErrorCode::kAudioCapture;
  }

  // Report the error and do not create the session if an error was found.
  if (error != media::mojom::SpeechRecognitionErrorCode::kNone) {
    mojo::Remote<media::mojom::SpeechRecognitionSessionClient> client(
        std::move(client_remote));
    if (client.is_bound()) {
      client->ErrorOccurred(media::mojom::SpeechRecognitionError::New(
          error, media::mojom::SpeechAudioErrorDetails::kNone));
      client->Ended();
    } else if (config.event_listener) {
      // The client may have been moved into the event listener (as
      // SpeechRecognitionDispatcherHost does), so report the error there.
      config.event_listener->OnRecognitionError(
          session_id, media::mojom::SpeechRecognitionError(
                          error, media::mojom::SpeechAudioErrorDetails::kNone));
      config.event_listener->OnRecognitionEnd(session_id);
    } else {
      // At least one client should have been informed of the error.
      NOTREACHED();
    }
    return session_id;
  }

  // Set up the new session.
  auto session = std::make_unique<Session>();
  session->id = session_id;
  session->config = config;
  session->context = config.initial_context;
  session->use_microphone = !audio_forwarder_config.has_value();

#if !BUILDFLAG(IS_ANDROID)
#if !BUILDFLAG(IS_FUCHSIA)
  if (UseOnDeviceSpeechRecognition(config) &&
      audio_forwarder_config.has_value()) {
    CHECK_GT(audio_forwarder_config.value().channel_count, 0);
    CHECK_GT(audio_forwarder_config.value().sample_rate, 0);
    // The speech recognition service process will create and manage the speech
    // recognition session instead of the browser. Raw audio will be passed
    // directly to the speech recognition process and speech recognition events
    // will be returned directly to the renderer, bypassing the browser
    // entirely.
    if (!speech_recognition_context_.is_bound()) {
      SpeechRecognitionManagerDelegate* speech_recognition_mgr_delegate =
          SpeechRecognitionManagerImpl::GetInstance()
              ? SpeechRecognitionManagerImpl::GetInstance()->delegate()
              : nullptr;

      CHECK(speech_recognition_mgr_delegate);
      mojo::PendingReceiver<media::mojom::SpeechRecognitionContext>
          speech_recognition_context_receiver =
              speech_recognition_context_.BindNewPipeAndPassReceiver();
      speech_recognition_mgr_delegate->BindSpeechRecognitionContext(
          std::move(speech_recognition_context_receiver), config.language);
    }

    media::mojom::SpeechRecognitionOptionsPtr options =
        media::mojom::SpeechRecognitionOptions::New();
    options->recognition_mode = media::mojom::SpeechRecognitionMode::kCaption;
    options->enable_formatting = false;
    options->recognizer_client_type =
        media::mojom::RecognizerClientType::kLiveCaption;
    options->skip_continuously_empty_audio = true;
    options->recognition_context = config.recognition_context;

    speech_recognition_context_->BindWebSpeechRecognizer(
        std::move(session_receiver), std::move(client_remote),
        std::move(audio_forwarder_config.value().audio_forwarder),
        audio_forwarder_config.value().channel_count,
        audio_forwarder_config.value().sample_rate, std::move(options),
        config.continuous);

    // The session is managed by the speech recognition service directly, and
    // thus does not need to be associated with a session id in the browser.
    return 0;
  }
#endif  // !BUILDFLAG(IS_FUCHSIA)

  std::unique_ptr<SpeechRecognitionEngine> speech_recognition_engine;

#if !BUILDFLAG(IS_FUCHSIA)
  if (UseOnDeviceSpeechRecognition(config)) {
    std::unique_ptr<SodaSpeechRecognitionEngineImpl>
        soda_speech_recognition_engine =
            std::make_unique<SodaSpeechRecognitionEngineImpl>(config);
    if (soda_speech_recognition_engine->Initialize()) {
      speech_recognition_engine = std::move(soda_speech_recognition_engine);
    }
  }
#endif  // !BUILDFLAG(IS_FUCHSIA)

  if (!speech_recognition_engine) {
    // A NetworkSpeechRecognitionEngineImpl (and corresponding Config) is
    // required only when using SpeechRecognizerImpl, which performs the audio
    // capture and endpointing in the browser. This is not the case on Android,
    // where not only the speech recognition but also the audio capture and
    // endpointing activities are performed outside of the browser (delegated
    // via JNI to the Android API implementation).

    NetworkSpeechRecognitionEngineImpl::Config remote_engine_config;
    remote_engine_config.language = config.language;
    remote_engine_config.grammars = config.grammars;
    remote_engine_config.audio_sample_rate =
        audio_forwarder_config.has_value()
            ? audio_forwarder_config.value().sample_rate
            : SpeechRecognizerImpl::kAudioSampleRate;
    remote_engine_config.audio_num_bits_per_sample =
        SpeechRecognizerImpl::kNumBitsPerAudioSample;
    remote_engine_config.filter_profanities = config.filter_profanities;
    remote_engine_config.continuous = config.continuous;
    remote_engine_config.interim_results = config.interim_results;
    remote_engine_config.max_hypotheses = config.max_hypotheses;
    remote_engine_config.origin_url = config.origin.Serialize();
    remote_engine_config.auth_token = config.auth_token;
    remote_engine_config.auth_scope = config.auth_scope;
    remote_engine_config.preamble = config.preamble;

    std::unique_ptr<NetworkSpeechRecognitionEngineImpl> google_remote_engine =
        std::make_unique<NetworkSpeechRecognitionEngineImpl>(
            config.shared_url_loader_factory);
    google_remote_engine->SetConfig(remote_engine_config);
    speech_recognition_engine = std::move(google_remote_engine);
  }

  session->recognizer = new SpeechRecognizerImpl(
      this, audio_system_, session_id, config.continuous,
      config.interim_results, std::move(speech_recognition_engine),
      audio_forwarder_config.has_value()
          ? std::make_optional<SpeechRecognitionAudioForwarderConfig>(
                audio_forwarder_config.value())
          : std::nullopt);

#else
  session->recognizer = new SpeechRecognizerImplAndroid(this, session_id);
#endif  // !BUILDFLAG(IS_ANDROID)

  sessions_[session_id] = std::move(session);

  GetUIThreadTaskRunner({})->PostTask(
      FROM_HERE,
      base::BindOnce(
          &FrameSessionTracker::CreateObserverForSession,
          config.initial_context.render_process_id,
          config.initial_context.render_frame_id, session_id,
          base::BindRepeating(&SpeechRecognitionManagerImpl::AbortSessionImpl,
                              weak_factory_.GetWeakPtr())));

  return session_id;
}

void SpeechRecognitionManagerImpl::OnRecognitionEnd(int session_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  if (!SessionExists(session_id))
    return;

  if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
    delegate_listener->OnRecognitionEnd(session_id);
  if (SpeechRecognitionEventListener* listener = GetListener(session_id))
    listener->OnRecognitionEnd(session_id);
  base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
      FROM_HERE, base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
                                weak_factory_.GetWeakPtr(), session_id,
                                EVENT_RECOGNITION_ENDED));
}

SpeechRecognitionSessionContext SpeechRecognitionManagerImpl::GetSessionContext(
    int session_id) {
  return GetSession(session_id)->context;
}

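// Returns true if the session should use on-device speech recognition: the
// config requests it, and either the on-device model for the language is
// available or falling back to the cloud service is not allowed.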
bool SpeechRecognitionManagerImpl::UseOnDeviceSpeechRecognition(
    const SpeechRecognitionSessionConfig& config) {
#if !BUILDFLAG(IS_FUCHSIA) && !BUILDFLAG(IS_ANDROID)
  return config.on_device &&
         (speech::IsOnDeviceSpeechRecognitionAvailable(config.language) ==
              media::mojom::AvailabilityStatus::kAvailable ||
          !config.allow_cloud_fallback);
#else
  return false;
#endif  // !BUILDFLAG(IS_FUCHSIA) && !BUILDFLAG(IS_ANDROID)
}

void SpeechRecognitionManagerImpl::AbortAllSessionsForRenderFrame(
    int render_process_id,
    int render_frame_id) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);

  for (const auto& session_pair : sessions_) {
    Session* session = session_pair.second.get();
    if (session->context.render_process_id == render_process_id &&
        session->context.render_frame_id == render_frame_id) {
      AbortSession(session->id);
    }
  }
}

// ----------------------- Core FSM implementation ---------------------------
void SpeechRecognitionManagerImpl::DispatchEvent(int session_id,
                                                 FSMEvent event) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);

  // There are some corner cases in which the session might be deleted (due to
  // an EndRecognition event) between a request (e.g. Abort) and its dispatch.
  if (!SessionExists(session_id))
    return;

  Session* session = GetSession(session_id);
  FSMState session_state = GetSessionState(session_id);
  DCHECK_LE(session_state, SESSION_STATE_MAX_VALUE);
  DCHECK_LE(event, EVENT_MAX_VALUE);

  // Event dispatching must be sequential; otherwise it would break the rules
  // and assumptions of the finite state automaton model.
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;
  ExecuteTransitionAndGetNextState(session, session_state, event);
  is_dispatching_event_ = false;
}

// This FSM handles the evolution of each session, from the viewpoint of the
// interaction with the user (who may be either the browser end-user
// interacting with the UI bubbles or a JS developer interacting with the JS
// methods). All the events received by the SpeechRecognizer instances (one
// for each session) are always routed to the
// SpeechRecognitionEventListener(s) regardless of the choices taken in this
// FSM.
void SpeechRecognitionManagerImpl::ExecuteTransitionAndGetNextState(
    Session* session, FSMState session_state, FSMEvent event) {
  // Note: since we're not tracking the state of the recognizer object but
  // rather directly retrieving it (through GetSessionState), we see its events
  // (that are AUDIO_ENDED and RECOGNITION_ENDED) after its state evolution
  // (e.g., when we receive the AUDIO_ENDED event, the recognizer has just
  // completed the transition from CAPTURING_AUDIO to WAITING_FOR_RESULT, thus
  // we perceive the AUDIO_ENDED event in WAITING_FOR_RESULT).
  // This makes the code below a bit tricky but avoids a lot of code for
  // tracking and reconstructing asynchronously the state of the recognizer.
  switch (session_state) {
    case SESSION_STATE_IDLE:
      switch (event) {
        case EVENT_START:
          return SessionStart(*session);
        case EVENT_UPDATE_RECOGNITION_CONTEXT:
          return SessionUpdateRecognitionContext(*session);
        case EVENT_ABORT:
          return SessionAbort(*session);
        case EVENT_RECOGNITION_ENDED:
          return SessionDelete(session);
        case EVENT_STOP_CAPTURE:
          return SessionStopAudioCapture(*session);
        case EVENT_AUDIO_ENDED:
          return;
      }
      break;
    case SESSION_STATE_CAPTURING_AUDIO:
      switch (event) {
        case EVENT_UPDATE_RECOGNITION_CONTEXT:
          return SessionUpdateRecognitionContext(*session);
        case EVENT_STOP_CAPTURE:
          return SessionStopAudioCapture(*session);
        case EVENT_ABORT:
          return SessionAbort(*session);
        case EVENT_START:
          return;
        case EVENT_AUDIO_ENDED:
        case EVENT_RECOGNITION_ENDED:
          return NotFeasible(*session, event);
      }
      break;
    case SESSION_STATE_WAITING_FOR_RESULT:
      switch (event) {
        case EVENT_UPDATE_RECOGNITION_CONTEXT:
          return SessionUpdateRecognitionContext(*session);
        case EVENT_ABORT:
          return SessionAbort(*session);
        case EVENT_AUDIO_ENDED:
          return ResetCapturingSessionId(*session);
        case EVENT_START:
        case EVENT_STOP_CAPTURE:
          return;
        case EVENT_RECOGNITION_ENDED:
          return NotFeasible(*session, event);
      }
      break;
  }
  return NotFeasible(*session, event);
}

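// Derives the FSM state on demand from the recognizer's current activity
// rather than tracking it separately; see the note in
// ExecuteTransitionAndGetNextState() about the resulting event/state skew.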
SpeechRecognitionManagerImpl::FSMState
SpeechRecognitionManagerImpl::GetSessionState(int session_id) const {
  Session* session = GetSession(session_id);
  if (!session->recognizer.get() || !session->recognizer->IsActive())
    return SESSION_STATE_IDLE;
  if (session->recognizer->IsCapturingAudio())
    return SESSION_STATE_CAPTURING_AUDIO;
  return SESSION_STATE_WAITING_FOR_RESULT;
}

// ----------- Contract for all the FSM evolution functions below -------------
//  - They are guaranteed to be executed on the IO thread;
//  - They are guaranteed not to be reentrant (with themselves or each other).

void SpeechRecognitionManagerImpl::SessionStart(const Session& session) {
  const blink::MediaStreamDevices& devices = session.context.devices;
  std::string device_id;
  if (devices.empty()) {
    // From the ask_user=false path, use the default device.
    // TODO(xians): Abort the session once we no longer need to support this
    // path.
    device_id = media::AudioDeviceDescription::kDefaultDeviceId;
  } else {
    // From the ask_user=true path, use the selected device.
    DCHECK_EQ(1u, devices.size());
    DCHECK_EQ(blink::mojom::MediaStreamType::DEVICE_AUDIO_CAPTURE,
              devices.front().type);
    device_id = devices.front().id;
  }

  session.recognizer->StartRecognition(device_id);
}

void SpeechRecognitionManagerImpl::SessionUpdateRecognitionContext(
    const Session& session) {
  CHECK(session.recognizer.get());
  session.recognizer->UpdateRecognitionContext(session.recognition_context);
}

void SpeechRecognitionManagerImpl::SessionAbort(const Session& session) {
  if (microphone_session_id_ == session.id) {
    microphone_session_id_ = kSessionIDInvalid;
  }
  DCHECK(session.recognizer.get());
  session.recognizer->AbortRecognition();
}

void SpeechRecognitionManagerImpl::SessionStopAudioCapture(
    const Session& session) {
  DCHECK(session.recognizer.get());
  session.recognizer->StopAudioCapture();
}

void SpeechRecognitionManagerImpl::ResetCapturingSessionId(
    const Session& session) {
  microphone_session_id_ = kSessionIDInvalid;
}

void SpeechRecognitionManagerImpl::SessionDelete(Session* session) {
  DCHECK(!session->recognizer.get() || !session->recognizer->IsActive());
  if (microphone_session_id_ == session->id) {
    microphone_session_id_ = kSessionIDInvalid;
  }
  if (!session->context.label.empty())
    media_stream_manager_->CancelRequest(session->context.label);
  sessions_.erase(session->id);
}

void SpeechRecognitionManagerImpl::NotFeasible(const Session& session,
                                               FSMEvent event) {
  NOTREACHED() << "Unfeasible event " << event << " in state "
               << GetSessionState(session.id) << " for session " << session.id;
}

int SpeechRecognitionManagerImpl::GetNextSessionID() {
  ++last_session_id_;
  // Deal with wrapping of last_session_id_. (How civilized).
  if (last_session_id_ <= 0)
    last_session_id_ = 1;
  return last_session_id_;
}

bool SpeechRecognitionManagerImpl::SessionExists(int session_id) const {
  return sessions_.find(session_id) != sessions_.end();
}

SpeechRecognitionManagerImpl::Session*
SpeechRecognitionManagerImpl::GetSession(int session_id) const {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);
  auto iter = sessions_.find(session_id);
  CHECK(iter != sessions_.end());
  return iter->second.get();
}

SpeechRecognitionEventListener* SpeechRecognitionManagerImpl::GetListener(
    int session_id) const {
  Session* session = GetSession(session_id);
  if (session->config.event_listener)
    return session->config.event_listener.get();
  return nullptr;
}

SpeechRecognitionEventListener*
SpeechRecognitionManagerImpl::GetDelegateListener() const {
  return delegate_ ? delegate_->GetEventListener() : nullptr;
}

const SpeechRecognitionSessionConfig&
SpeechRecognitionManagerImpl::GetSessionConfig(int session_id) {
  return GetSession(session_id)->config;
}

SpeechRecognitionManagerImpl::Session::Session()
    : id(kSessionIDInvalid), abort_requested(false) {}

SpeechRecognitionManagerImpl::Session::~Session() {
}

}  // namespace content