blob: 31dd4a84dd01a5183fa80eeb677772ab447c6a33 [file] [log] [blame]
Avi Drissman4e1b7bc32022-09-15 14:03:501// Copyright 2013 The Chromium Authors
[email protected]3bc0b562011-08-24 23:51:042// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
[email protected]c52b2892012-03-07 11:01:025#include "content/browser/speech/speech_recognition_manager_impl.h"
[email protected]3bc0b562011-08-24 23:51:046
Avi Drissman6ed86dc32018-03-03 00:29:247#include <algorithm>
8#include <map>
Evan Liu881ab7a2024-08-01 21:54:519#include <optional>
dcheng36b6aec92015-12-26 06:16:3610#include <utility>
Guido Urdaneta971548c2018-12-13 17:25:2211#include <vector>
dcheng36b6aec92015-12-26 06:16:3612
Avi Drissmanadac21992023-01-11 23:46:3913#include "base/functional/bind.h"
14#include "base/functional/callback.h"
skyostil95082a62015-06-05 19:53:0715#include "base/location.h"
Avi Drissman6ed86dc32018-03-03 00:29:2416#include "base/memory/ref_counted_delete_on_sequence.h"
Evan Liu045091f2024-11-20 22:24:4117#include "base/metrics/histogram_functions.h"
Patrick Monette643cdf62021-10-15 19:13:4218#include "base/task/sequenced_task_runner.h"
19#include "base/task/single_thread_task_runner.h"
avib7348942015-12-25 20:57:1020#include "build/build_config.h"
Evan Liub2b7f6d2024-08-05 22:55:4821#include "components/soda/soda_util.h"
[email protected]52e456b92012-02-23 17:13:1822#include "content/browser/browser_main_loop.h"
[email protected]2af35c502012-09-13 20:14:4323#include "content/browser/renderer_host/media/media_stream_manager.h"
[email protected]2b94cfe2013-06-07 13:12:5324#include "content/browser/renderer_host/media/media_stream_ui_proxy.h"
Yaowei Zhouf7df39c2024-02-29 04:53:4825#include "content/browser/speech/network_speech_recognition_engine_impl.h"
[email protected]ce1adc342013-05-20 13:35:4326#include "content/browser/speech/speech_recognizer_impl.h"
Eric Seckler8652dcd52018-09-20 10:42:2827#include "content/public/browser/browser_task_traits.h"
Matt Menke7b2266e2018-06-07 19:32:0928#include "content/public/browser/browser_thread.h"
[email protected]66cfec62012-02-24 17:57:5129#include "content/public/browser/content_browser_client.h"
Sreeja Kamishetty0a0961f2021-10-11 16:23:5330#include "content/public/browser/document_user_data.h"
Avi Drissman6ed86dc32018-03-03 00:29:2431#include "content/public/browser/render_frame_host.h"
[email protected]ce967862012-02-09 22:47:0532#include "content/public/browser/resource_context.h"
Evan Liu881ab7a2024-08-01 21:54:5133#include "content/public/browser/speech_recognition_audio_forwarder_config.h"
[email protected]b450e902012-04-25 20:20:1834#include "content/public/browser/speech_recognition_event_listener.h"
[email protected]c52b2892012-03-07 11:01:0235#include "content/public/browser/speech_recognition_manager_delegate.h"
[email protected]b450e902012-04-25 20:20:1836#include "content/public/browser/speech_recognition_session_config.h"
37#include "content/public/browser/speech_recognition_session_context.h"
Avi Drissman6ed86dc32018-03-03 00:29:2438#include "content/public/browser/web_contents.h"
39#include "content/public/browser/web_contents_observer.h"
Hans Wennborg5ffd1392019-10-16 11:00:0240#include "content/public/common/content_client.h"
olka251dd5692016-04-27 15:50:1741#include "media/audio/audio_device_description.h"
Evan Liud422a372025-07-09 23:38:1642#include "media/base/limits.h"
Evan Liu881ab7a2024-08-01 21:54:5143#include "media/mojo/mojom/speech_recognition.mojom.h"
44#include "media/mojo/mojom/speech_recognition_audio_forwarder.mojom.h"
Evan Liud7252282024-05-16 20:10:3245#include "media/mojo/mojom/speech_recognition_error.mojom.h"
46#include "media/mojo/mojom/speech_recognition_result.mojom.h"
Evan Liu881ab7a2024-08-01 21:54:5147#include "media/mojo/mojom/speech_recognizer.mojom.h"
48#include "mojo/public/cpp/bindings/pending_receiver.h"
guidouc6dd4462016-04-30 18:00:3749#include "url/gurl.h"
50#include "url/origin.h"
[email protected]3bc0b562011-08-24 23:51:0451
Xiaohan Wang8d67feb2022-01-15 14:37:4352#if BUILDFLAG(IS_ANDROID)
[email protected]6c590042013-06-14 12:23:2653#include "content/browser/speech/speech_recognizer_impl_android.h"
Evan Liu45289592024-03-21 17:46:1254#elif !BUILDFLAG(IS_FUCHSIA)
55#include "components/soda/constants.h"
Evan Liu45289592024-03-21 17:46:1256#include "components/soda/soda_util.h"
57#include "content/browser/speech/soda_speech_recognition_engine_impl.h"
58#include "media/base/media_switches.h"
Evan Liu45289592024-03-21 17:46:1259#endif // BUILDFLAG(IS_ANDROID)
[email protected]6c590042013-06-14 12:23:2660
[email protected]b450e902012-04-25 20:20:1861namespace content {
[email protected]66cfec62012-02-24 17:57:5162
[email protected]f3b279e2013-01-29 20:48:4663SpeechRecognitionManager* SpeechRecognitionManager::manager_for_tests_;
64
[email protected]28df14d2012-05-16 14:51:2265namespace {
[email protected]fcb8e0212012-10-29 11:57:1866
67SpeechRecognitionManagerImpl* g_speech_recognition_manager_impl;
[email protected]a52749172012-06-14 10:28:2668
Evan Liu045091f2024-11-20 22:24:4169constexpr char kWebSpeechAudioOnDeviceAvailableHistogram[] =
70 "Accessibility.WebSpeech.OnDeviceAvailable";
71constexpr char kWebSpeechAudioUseOnDeviceHistogram[] =
72 "Accessibility.WebSpeech.UseOnDevice";
73constexpr char kWebSpeechAudioUseAudioForwarderHistogram[] =
74 "Accessibility.WebSpeech.UseAudioForwarder";
Evan Liu1353e8e2025-06-04 17:13:5675constexpr char kWebSpeechCanRenderFrameUseOnDeviceHistogram[] =
76 "Accessibility.WebSpeech.CanRenderFrameUseOnDevice";
77constexpr char kWebSpeechIsOnDeviceSpeechRecognitionInstalledHistogram[] =
78 "Accessibility.WebSpeech.IsOnDeviceSpeechRecognitionInstalled";
Evan Liu045091f2024-11-20 22:24:4179
[email protected]28df14d2012-05-16 14:51:2280} // namespace
81
Guido Urdaneta971548c2018-12-13 17:25:2282int SpeechRecognitionManagerImpl::next_requester_id_ = 0;
83
Dave Tapuska54d439892021-08-12 16:43:1784class FrameSessionTracker
Sreeja Kamishetty0a0961f2021-10-11 16:23:5385 : public content::DocumentUserData<FrameSessionTracker> {
Avi Drissman6ed86dc32018-03-03 00:29:2486 public:
87 using FrameDeletedCallback =
88 base::RepeatingCallback<void(int /* session_id */)>;
Avi Drissman6ed86dc32018-03-03 00:29:2489
Dave Tapuska54d439892021-08-12 16:43:1790 ~FrameSessionTracker() override {
91 DCHECK_CURRENTLY_ON(BrowserThread::UI);
Avi Drissman6ed86dc32018-03-03 00:29:2492
Dave Tapuska54d439892021-08-12 16:43:1793 for (auto session : sessions_) {
94 GetIOThreadTaskRunner({})->PostTask(
95 FROM_HERE, base::BindOnce(frame_deleted_callback_, session));
96 }
Avi Drissman6ed86dc32018-03-03 00:29:2497 }
98
Dave Tapuska54d439892021-08-12 16:43:1799 static void CreateObserverForSession(int render_process_id,
100 int render_frame_id,
101 int session_id,
102 FrameDeletedCallback callback) {
103 DCHECK_CURRENTLY_ON(BrowserThread::UI);
Avi Drissman6ed86dc32018-03-03 00:29:24104
Dave Tapuska54d439892021-08-12 16:43:17105 RenderFrameHost* render_frame_host =
106 RenderFrameHost::FromID(render_process_id, render_frame_id);
107 if (!render_frame_host)
108 return;
109
110 FrameSessionTracker* tracker =
111 GetOrCreateForCurrentDocument(render_frame_host);
112
113 // This will clobber any previously set callback but it will always
114 // be the same binding.
115 tracker->SetCallback(std::move(callback));
116 tracker->AddSession(session_id);
117 }
118
119 static void RemoveObserverForSession(int render_process_id,
120 int render_frame_id,
121 int session_id) {
122 DCHECK_CURRENTLY_ON(BrowserThread::UI);
123
124 RenderFrameHost* render_frame_host =
125 RenderFrameHost::FromID(render_process_id, render_frame_id);
126 if (!render_frame_host)
127 return;
128
129 FrameSessionTracker* tracker = GetForCurrentDocument(render_frame_host);
130 if (!tracker)
131 return;
132 tracker->RemoveSession(session_id);
133 }
134
135 private:
Lukasz Anforowiczcfeb95c2021-10-01 19:33:35136 explicit FrameSessionTracker(content::RenderFrameHost* rfh)
Sreeja Kamishetty0a0961f2021-10-11 16:23:53137 : DocumentUserData<FrameSessionTracker>(rfh) {}
Dave Tapuska54d439892021-08-12 16:43:17138
Sreeja Kamishetty0a0961f2021-10-11 16:23:53139 friend class content::DocumentUserData<FrameSessionTracker>;
140 DOCUMENT_USER_DATA_KEY_DECL();
Dave Tapuska54d439892021-08-12 16:43:17141
142 void AddSession(int session_id) { sessions_.insert(session_id); }
143
144 void RemoveSession(int session_id) { sessions_.erase(session_id); }
145
146 void SetCallback(FrameDeletedCallback callback) {
147 frame_deleted_callback_ = std::move(callback);
148 }
149
150 FrameDeletedCallback frame_deleted_callback_;
151 std::set<int> sessions_;
152};
153
Sreeja Kamishetty0a0961f2021-10-11 16:23:53154DOCUMENT_USER_DATA_KEY_IMPL(FrameSessionTracker);
Avi Drissman6ed86dc32018-03-03 00:29:24155
[email protected]fcb8e0212012-10-29 11:57:18156SpeechRecognitionManager* SpeechRecognitionManager::GetInstance() {
[email protected]f3b279e2013-01-29 20:48:46157 if (manager_for_tests_)
158 return manager_for_tests_;
[email protected]fcb8e0212012-10-29 11:57:18159 return SpeechRecognitionManagerImpl::GetInstance();
160}
[email protected]3bc0b562011-08-24 23:51:04161
[email protected]0a208bf2013-10-01 21:09:54162void SpeechRecognitionManager::SetManagerForTesting(
[email protected]f3b279e2013-01-29 20:48:46163 SpeechRecognitionManager* manager) {
164 manager_for_tests_ = manager;
165}
166
[email protected]c52b2892012-03-07 11:01:02167SpeechRecognitionManagerImpl* SpeechRecognitionManagerImpl::GetInstance() {
[email protected]28df14d2012-05-16 14:51:22168 return g_speech_recognition_manager_impl;
[email protected]3bc0b562011-08-24 23:51:04169}
170
Evan Liu68919fc2025-03-06 20:38:18171bool SpeechRecognitionManagerImpl::IsOnDeviceSpeechRecognitionInstalled(
Evan Liu45289592024-03-21 17:46:12172 const SpeechRecognitionSessionConfig& config) {
Evan Liu881ab7a2024-08-01 21:54:51173#if !BUILDFLAG(IS_FUCHSIA) && !BUILDFLAG(IS_ANDROID)
Evan Liu68919fc2025-03-06 20:38:18174 return speech::IsOnDeviceSpeechRecognitionAvailable(config.language) ==
175 media::mojom::AvailabilityStatus::kAvailable;
Evan Liu881ab7a2024-08-01 21:54:51176#else
177 return false;
Evan Liu45289592024-03-21 17:46:12178#endif // !BUILDFLAG(IS_FUCHSIA) && !BUILDFLAG(IS_ANDROID)
Evan Liu881ab7a2024-08-01 21:54:51179}
Evan Liu45289592024-03-21 17:46:12180
[email protected]aa445462013-06-21 17:12:36181SpeechRecognitionManagerImpl::SpeechRecognitionManagerImpl(
olkaef762c92017-02-06 16:45:16182 media::AudioSystem* audio_system,
olka087876b2017-01-27 12:50:12183 MediaStreamManager* media_stream_manager)
olkaef762c92017-02-06 16:45:16184 : audio_system_(audio_system),
185 media_stream_manager_(media_stream_manager),
olka087876b2017-01-27 12:50:12186 delegate_(GetContentClient()
187 ->browser()
188 ->CreateSpeechRecognitionManagerDelegate()),
Jeremy Roman3bca4bf2019-07-11 03:41:25189 requester_id_(next_requester_id_++) {
[email protected]28df14d2012-05-16 14:51:22190 DCHECK(!g_speech_recognition_manager_impl);
191 g_speech_recognition_manager_impl = this;
[email protected]66cfec62012-02-24 17:57:51192}
193
[email protected]c52b2892012-03-07 11:01:02194SpeechRecognitionManagerImpl::~SpeechRecognitionManagerImpl() {
Kevin Marshallf7882372017-06-06 00:14:34195 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]28df14d2012-05-16 14:51:22196 DCHECK(g_speech_recognition_manager_impl);
Kevin Marshallf7882372017-06-06 00:14:34197
Ivan Kotenkov2c0d2bb32017-11-01 15:41:28198 g_speech_recognition_manager_impl = nullptr;
[email protected]0944a7292011-09-21 16:45:06199}
200
[email protected]b450e902012-04-25 20:20:18201int SpeechRecognitionManagerImpl::CreateSession(
[email protected]12f4fb92012-05-16 10:30:16202 const SpeechRecognitionSessionConfig& config) {
Evan Liu881ab7a2024-08-01 21:54:51203 return CreateSession(std::move(config), mojo::NullReceiver(),
204 mojo::NullRemote(), std::nullopt);
205}
[email protected]b450e902012-04-25 20:20:18206
Evan Liu881ab7a2024-08-01 21:54:51207int SpeechRecognitionManagerImpl::CreateSession(
208 const SpeechRecognitionSessionConfig& config,
209 mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
210 session_receiver,
211 mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
212 client_remote,
213 std::optional<SpeechRecognitionAudioForwarderConfig>
214 audio_forwarder_config) {
Evan Liu79284922025-05-29 20:29:10215 return CreateSession(
216 std::move(config), std::move(session_receiver), std::move(client_remote),
Evan Liu881ab7a2024-08-01 21:54:51217 audio_forwarder_config.has_value()
218 ? std::make_optional<SpeechRecognitionAudioForwarderConfig>(
219 audio_forwarder_config.value())
Evan Liu79284922025-05-29 20:29:10220 : std::nullopt,
221 /*can_render_frame_use_on_device=*/
222 false); // On-device speech recognition may only be used if the callsite
223 // explicitly checks if the render frame is permitted to use it.
[email protected]b450e902012-04-25 20:20:18224}
225
226void SpeechRecognitionManagerImpl::StartSession(int session_id) {
mostynb042582e2015-03-16 22:13:40227 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]12f4fb92012-05-16 10:30:16228 if (!SessionExists(session_id))
229 return;
[email protected]b450e902012-04-25 20:20:18230
Evan Liu1c0164d2025-02-12 20:40:36231 if (sessions_[session_id]->use_microphone) {
232 // If there is another session using the microphone, abort that.
233 if (microphone_session_id_ != kSessionIDInvalid &&
234 microphone_session_id_ != session_id) {
235 AbortSession(microphone_session_id_);
236 }
237
238 microphone_session_id_ = session_id;
239
240 if (delegate_) {
241 delegate_->CheckRecognitionIsAllowed(
242 session_id,
243 base::BindOnce(
244 &SpeechRecognitionManagerImpl::RecognitionAllowedCallback,
245 weak_factory_.GetWeakPtr(), session_id));
246 }
247 return;
[email protected]b450e902012-04-25 20:20:18248 }
249
Evan Liu1c0164d2025-02-12 20:40:36250 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
251 FROM_HERE,
252 base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
253 weak_factory_.GetWeakPtr(), session_id, EVENT_START));
[email protected]b450e902012-04-25 20:20:18254}
255
256void SpeechRecognitionManagerImpl::RecognitionAllowedCallback(int session_id,
[email protected]2af35c502012-09-13 20:14:43257 bool ask_user,
[email protected]b450e902012-04-25 20:20:18258 bool is_allowed) {
mostynb042582e2015-03-16 22:13:40259 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]2af35c502012-09-13 20:14:43260
Avi Drissmane6d65502018-02-13 16:21:58261 auto iter = sessions_.find(session_id);
Avi Drissman6ed86dc32018-03-03 00:29:24262 if (iter == sessions_.end())
263 return;
264
Avi Drissmane6d65502018-02-13 16:21:58265 Session* session = iter->second.get();
[email protected]6df4ab92013-10-02 19:22:28266
267 if (session->abort_requested)
268 return;
269
[email protected]2af35c502012-09-13 20:14:43270 if (ask_user) {
[email protected]6df4ab92013-10-02 19:22:28271 SpeechRecognitionSessionContext& context = session->context;
[email protected]aa445462013-06-21 17:12:36272 context.label = media_stream_manager_->MakeMediaAccessRequest(
Bryant Chandlerabd520a2023-10-30 17:47:35273 {context.render_process_id, context.render_frame_id}, requester_id_,
Scott Violetb166d202021-01-27 22:01:11274 session_id, blink::StreamControls(true, false), context.security_origin,
Mark Pilgrim35434202017-07-14 19:43:24275 base::BindOnce(
[email protected]aa445462013-06-21 17:12:36276 &SpeechRecognitionManagerImpl::MediaRequestPermissionCallback,
277 weak_factory_.GetWeakPtr(), session_id));
[email protected]2af35c502012-09-13 20:14:43278 return;
279 }
[email protected]2af35c502012-09-13 20:14:43280
[email protected]b450e902012-04-25 20:20:18281 if (is_allowed) {
Sean Maher5b9af51f2022-11-21 15:32:47282 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
[email protected]dd32b1272013-05-04 14:17:11283 FROM_HERE,
tzike2aca992017-09-05 08:50:54284 base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
285 weak_factory_.GetWeakPtr(), session_id, EVENT_START));
[email protected]b450e902012-04-25 20:20:18286 } else {
Adithya Srinivasanc35bf3962018-06-12 14:28:14287 OnRecognitionError(
Evan Liud7252282024-05-16 20:10:32288 session_id, media::mojom::SpeechRecognitionError(
289 media::mojom::SpeechRecognitionErrorCode::kNotAllowed,
290 media::mojom::SpeechAudioErrorDetails::kNone));
Sean Maher5b9af51f2022-11-21 15:32:47291 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
[email protected]dd32b1272013-05-04 14:17:11292 FROM_HERE,
tzike2aca992017-09-05 08:50:54293 base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
294 weak_factory_.GetWeakPtr(), session_id, EVENT_ABORT));
[email protected]b450e902012-04-25 20:20:18295 }
296}
297
[email protected]920cfb42012-11-21 17:26:08298void SpeechRecognitionManagerImpl::MediaRequestPermissionCallback(
[email protected]2b94cfe2013-06-07 13:12:53299 int session_id,
Simon Hangl1131b4a2022-05-25 10:25:19300 const blink::mojom::StreamDevicesSet& stream_devices_set,
dcheng59716272016-04-09 05:19:08301 std::unique_ptr<MediaStreamUIProxy> stream_ui) {
mostynb042582e2015-03-16 22:13:40302 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]920cfb42012-11-21 17:26:08303
Avi Drissmane6d65502018-02-13 16:21:58304 auto iter = sessions_.find(session_id);
[email protected]2b94cfe2013-06-07 13:12:53305 if (iter == sessions_.end())
306 return;
[email protected]920cfb42012-11-21 17:26:08307
Simon Hangl1131b4a2022-05-25 10:25:19308 // The SpeechRecognictionManager is not used with multiple streams
Simon Hanglf6f93452023-05-09 17:11:53309 // which is only supported in combination with the getAllScreensMedia API.
Simon Hangl4474b6eb2022-06-28 23:27:36310 // The |stream_devices| vector can be empty e.g. if the permission
311 // was denied.
312 DCHECK_LE(stream_devices_set.stream_devices.size(), 1u);
Simon Hanglfd5379972022-06-09 09:36:54313
Simon Hangl574cecf2022-05-05 06:19:46314 blink::MediaStreamDevices devices_list =
Simon Hanglfd5379972022-06-09 09:36:54315 blink::ToMediaStreamDevicesList(stream_devices_set);
Simon Hangl574cecf2022-05-05 06:19:46316 const bool is_allowed = !devices_list.empty();
[email protected]2b94cfe2013-06-07 13:12:53317 if (is_allowed) {
318 // Copy the approved devices array to the context for UI indication.
Simon Hangl574cecf2022-05-05 06:19:46319 iter->second->context.devices = devices_list;
[email protected]2b94cfe2013-06-07 13:12:53320
321 // Save the UI object.
dcheng36b6aec92015-12-26 06:16:36322 iter->second->ui = std::move(stream_ui);
[email protected]920cfb42012-11-21 17:26:08323 }
[email protected]2b94cfe2013-06-07 13:12:53324
325 // Clear the label to indicate the request has been done.
326 iter->second->context.label.clear();
327
328 // Notify the recognition about the request result.
329 RecognitionAllowedCallback(iter->first, false, is_allowed);
[email protected]920cfb42012-11-21 17:26:08330}
331
[email protected]b450e902012-04-25 20:20:18332void SpeechRecognitionManagerImpl::AbortSession(int session_id) {
mostynb042582e2015-03-16 22:13:40333 DCHECK_CURRENTLY_ON(BrowserThread::IO);
Avi Drissman6ed86dc32018-03-03 00:29:24334 auto iter = sessions_.find(session_id);
335 if (iter == sessions_.end())
[email protected]12f4fb92012-05-16 10:30:16336 return;
[email protected]b450e902012-04-25 20:20:18337
Gabriel Charettee7cdc5cd2020-05-27 23:35:05338 GetUIThreadTaskRunner({})->PostTask(
339 FROM_HERE,
Dave Tapuska54d439892021-08-12 16:43:17340 base::BindOnce(&FrameSessionTracker::RemoveObserverForSession,
Gabriel Charettee7cdc5cd2020-05-27 23:35:05341 iter->second->config.initial_context.render_process_id,
342 iter->second->config.initial_context.render_frame_id,
343 session_id));
Avi Drissman6ed86dc32018-03-03 00:29:24344
345 AbortSessionImpl(session_id);
346}
347
348void SpeechRecognitionManagerImpl::AbortSessionImpl(int session_id) {
349 DCHECK_CURRENTLY_ON(BrowserThread::IO);
350
Avi Drissmane6d65502018-02-13 16:21:58351 auto iter = sessions_.find(session_id);
Avi Drissman6ed86dc32018-03-03 00:29:24352 if (iter == sessions_.end())
353 return;
354
[email protected]2b94cfe2013-06-07 13:12:53355 iter->second->ui.reset();
[email protected]2af35c502012-09-13 20:14:43356
[email protected]6df4ab92013-10-02 19:22:28357 if (iter->second->abort_requested)
358 return;
359
360 iter->second->abort_requested = true;
361
Sean Maher5b9af51f2022-11-21 15:32:47362 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
[email protected]dd32b1272013-05-04 14:17:11363 FROM_HERE,
tzike2aca992017-09-05 08:50:54364 base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
365 weak_factory_.GetWeakPtr(), session_id, EVENT_ABORT));
[email protected]b450e902012-04-25 20:20:18366}
367
368void SpeechRecognitionManagerImpl::StopAudioCaptureForSession(int session_id) {
mostynb042582e2015-03-16 22:13:40369 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18370
Avi Drissmane6d65502018-02-13 16:21:58371 auto iter = sessions_.find(session_id);
Avi Drissman6ed86dc32018-03-03 00:29:24372 if (iter == sessions_.end())
373 return;
374
Avi Drissman2c707cb2020-09-23 14:57:39375 GetUIThreadTaskRunner({})->PostTask(
376 FROM_HERE,
Dave Tapuska54d439892021-08-12 16:43:17377 base::BindOnce(&FrameSessionTracker::RemoveObserverForSession,
Avi Drissman2c707cb2020-09-23 14:57:39378 iter->second->config.initial_context.render_process_id,
379 iter->second->config.initial_context.render_frame_id,
380 session_id));
381
[email protected]2b94cfe2013-06-07 13:12:53382 iter->second->ui.reset();
[email protected]2af35c502012-09-13 20:14:43383
Sean Maher5b9af51f2022-11-21 15:32:47384 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
tzike2aca992017-09-05 08:50:54385 FROM_HERE, base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
386 weak_factory_.GetWeakPtr(), session_id,
387 EVENT_STOP_CAPTURE));
[email protected]b450e902012-04-25 20:20:18388}
389
Yiren Wang23998562025-01-28 21:31:05390void SpeechRecognitionManagerImpl::UpdateRecognitionContextForSession(
391 int session_id,
392 const media::SpeechRecognitionRecognitionContext& recognition_context) {
393 CHECK_CURRENTLY_ON(BrowserThread::IO);
394 auto iter = sessions_.find(session_id);
395 if (iter == sessions_.end()) {
396 return;
397 }
398 iter->second->recognition_context = recognition_context;
399
400 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
401 FROM_HERE, base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
402 weak_factory_.GetWeakPtr(), session_id,
403 EVENT_UPDATE_RECOGNITION_CONTEXT));
404}
405
// Here begins the SpeechRecognitionEventListener interface implementation,
// which will simply relay the events to the proper listener registered for the
// particular session and to the catch-all listener provided by the delegate
// (if any).
[email protected]b450e902012-04-25 20:20:18410
411void SpeechRecognitionManagerImpl::OnRecognitionStart(int session_id) {
mostynb042582e2015-03-16 22:13:40412 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18413 if (!SessionExists(session_id))
414 return;
415
Avi Drissmane6d65502018-02-13 16:21:58416 auto iter = sessions_.find(session_id);
[email protected]2b94cfe2013-06-07 13:12:53417 if (iter->second->ui) {
418 // Notify the UI that the devices are being used.
Sergey Poromovcbe6f7f2020-11-05 14:51:26419 iter->second->ui->OnStarted(
420 base::OnceClosure(), MediaStreamUI::SourceCallback(),
421 MediaStreamUIProxy::WindowIdCallback(), /*label=*/std::string(),
Scott Violetb166d202021-01-27 22:01:11422 /*screen_capture_ids=*/{}, MediaStreamUI::StateChangeCallback());
[email protected]920cfb42012-11-21 17:26:08423 }
[email protected]920cfb42012-11-21 17:26:08424
[email protected]12f4fb92012-05-16 10:30:16425 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
426 delegate_listener->OnRecognitionStart(session_id);
427 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
428 listener->OnRecognitionStart(session_id);
[email protected]b450e902012-04-25 20:20:18429}
430
431void SpeechRecognitionManagerImpl::OnAudioStart(int session_id) {
mostynb042582e2015-03-16 22:13:40432 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18433 if (!SessionExists(session_id))
434 return;
435
[email protected]12f4fb92012-05-16 10:30:16436 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
437 delegate_listener->OnAudioStart(session_id);
438 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
439 listener->OnAudioStart(session_id);
[email protected]b450e902012-04-25 20:20:18440}
441
[email protected]b450e902012-04-25 20:20:18442void SpeechRecognitionManagerImpl::OnSoundStart(int session_id) {
mostynb042582e2015-03-16 22:13:40443 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18444 if (!SessionExists(session_id))
445 return;
446
[email protected]12f4fb92012-05-16 10:30:16447 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
448 delegate_listener->OnSoundStart(session_id);
449 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
450 listener->OnSoundStart(session_id);
[email protected]b450e902012-04-25 20:20:18451}
452
453void SpeechRecognitionManagerImpl::OnSoundEnd(int session_id) {
mostynb042582e2015-03-16 22:13:40454 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18455 if (!SessionExists(session_id))
456 return;
457
[email protected]12f4fb92012-05-16 10:30:16458 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
459 delegate_listener->OnSoundEnd(session_id);
460 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
461 listener->OnSoundEnd(session_id);
[email protected]b450e902012-04-25 20:20:18462}
463
464void SpeechRecognitionManagerImpl::OnAudioEnd(int session_id) {
mostynb042582e2015-03-16 22:13:40465 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18466 if (!SessionExists(session_id))
467 return;
468
[email protected]12f4fb92012-05-16 10:30:16469 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
470 delegate_listener->OnAudioEnd(session_id);
471 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
472 listener->OnAudioEnd(session_id);
Sean Maher5b9af51f2022-11-21 15:32:47473 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
tzike2aca992017-09-05 08:50:54474 FROM_HERE, base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
475 weak_factory_.GetWeakPtr(), session_id,
476 EVENT_AUDIO_ENDED));
[email protected]b450e902012-04-25 20:20:18477}
478
[email protected]fc88c1e2012-12-04 09:54:36479void SpeechRecognitionManagerImpl::OnRecognitionResults(
Adithya Srinivasane75e3282018-06-01 15:09:00480 int session_id,
Evan Liud7252282024-05-16 20:10:32481 const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results) {
mostynb042582e2015-03-16 22:13:40482 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18483 if (!SessionExists(session_id))
484 return;
485
[email protected]12f4fb92012-05-16 10:30:16486 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
[email protected]fc88c1e2012-12-04 09:54:36487 delegate_listener->OnRecognitionResults(session_id, results);
[email protected]12f4fb92012-05-16 10:30:16488 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
[email protected]fc88c1e2012-12-04 09:54:36489 listener->OnRecognitionResults(session_id, results);
[email protected]b450e902012-04-25 20:20:18490}
491
492void SpeechRecognitionManagerImpl::OnRecognitionError(
Adithya Srinivasance7495062018-05-28 16:12:40493 int session_id,
Evan Liud7252282024-05-16 20:10:32494 const media::mojom::SpeechRecognitionError& error) {
mostynb042582e2015-03-16 22:13:40495 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18496 if (!SessionExists(session_id))
497 return;
498
[email protected]12f4fb92012-05-16 10:30:16499 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
500 delegate_listener->OnRecognitionError(session_id, error);
501 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
502 listener->OnRecognitionError(session_id, error);
[email protected]b450e902012-04-25 20:20:18503}
504
505void SpeechRecognitionManagerImpl::OnAudioLevelsChange(
506 int session_id, float volume, float noise_volume) {
mostynb042582e2015-03-16 22:13:40507 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18508 if (!SessionExists(session_id))
509 return;
510
[email protected]12f4fb92012-05-16 10:30:16511 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
512 delegate_listener->OnAudioLevelsChange(session_id, volume, noise_volume);
513 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
514 listener->OnAudioLevelsChange(session_id, volume, noise_volume);
[email protected]b450e902012-04-25 20:20:18515}
516
Evan Liu79284922025-05-29 20:29:10517int SpeechRecognitionManagerImpl::CreateSession(
518 const SpeechRecognitionSessionConfig& config,
519 mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
520 session_receiver,
521 mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
522 client_remote,
523 std::optional<SpeechRecognitionAudioForwarderConfig> audio_forwarder_config,
524 bool can_render_frame_use_on_device) {
525 DCHECK_CURRENTLY_ON(BrowserThread::IO);
526 const int session_id = GetNextSessionID();
527 DCHECK(!SessionExists(session_id));
528
529 base::UmaHistogramBoolean(kWebSpeechAudioOnDeviceAvailableHistogram,
530 IsOnDeviceSpeechRecognitionInstalled(config));
531 base::UmaHistogramBoolean(kWebSpeechAudioUseOnDeviceHistogram,
532 UseOnDeviceSpeechRecognition(config));
533 base::UmaHistogramBoolean(kWebSpeechAudioUseAudioForwarderHistogram,
534 audio_forwarder_config.has_value());
535
536 // Initialize the error to be none.
537 media::mojom::SpeechRecognitionErrorCode error =
538 media::mojom::SpeechRecognitionErrorCode::kNone;
539
540 if (UseOnDeviceSpeechRecognition(config)) {
Evan Liu1353e8e2025-06-04 17:13:56541 base::UmaHistogramBoolean(kWebSpeechCanRenderFrameUseOnDeviceHistogram,
542 can_render_frame_use_on_device);
Evan Liu79284922025-05-29 20:29:10543 if (!can_render_frame_use_on_device) {
544 error = media::mojom::SpeechRecognitionErrorCode::kServiceNotAllowed;
545 }
546
Evan Liu1353e8e2025-06-04 17:13:56547 bool is_on_device_speech_recognition_installed =
548 IsOnDeviceSpeechRecognitionInstalled(config);
549 base::UmaHistogramBoolean(
550 kWebSpeechIsOnDeviceSpeechRecognitionInstalledHistogram,
551 is_on_device_speech_recognition_installed);
Evan Liu79284922025-05-29 20:29:10552 // Set the error if on-device speech recognition must be used but is not
553 // available.
Evan Liu1353e8e2025-06-04 17:13:56554 if (!is_on_device_speech_recognition_installed) {
Evan Liu79284922025-05-29 20:29:10555 error = media::mojom::SpeechRecognitionErrorCode::kLanguageNotSupported;
556 }
557 } else {
558 // Set the error if on-device speech recognition is not used but recognition
559 // context is set.
560 if (config.recognition_context.has_value()) {
561 error = media::mojom::SpeechRecognitionErrorCode::kPhrasesNotSupported;
562 }
563 }
564
Evan Liud422a372025-07-09 23:38:16565 if (audio_forwarder_config.has_value() &&
566 (audio_forwarder_config.value().sample_rate >
567 media::limits::kMaxSampleRate ||
568 audio_forwarder_config.value().sample_rate <
569 media::limits::kMinSampleRate ||
570 audio_forwarder_config.value().channel_count <= 0 ||
571 audio_forwarder_config.value().channel_count >
572 media::limits::kMaxChannels)) {
573 error = media::mojom::SpeechRecognitionErrorCode::kAudioCapture;
574 }
575
Evan Liu79284922025-05-29 20:29:10576 // Throw the error and do not create the session if error is found.
577 if (error != media::mojom::SpeechRecognitionErrorCode::kNone) {
578 mojo::Remote<media::mojom::SpeechRecognitionSessionClient> client(
579 std::move(client_remote));
580 if (client.is_bound()) {
581 client->ErrorOccurred(media::mojom::SpeechRecognitionError::New(
582 error, media::mojom::SpeechAudioErrorDetails::kNone));
583 client->Ended();
584 } else if (config.event_listener) {
585 // The client may have been moved into the event_listener such as what
586 // SpeechRecognitionDispatcherHost does, so throw the error there.
587 config.event_listener.get()->OnRecognitionError(
588 session_id, media::mojom::SpeechRecognitionError(
589 error, media::mojom::SpeechAudioErrorDetails::kNone));
590 config.event_listener.get()->OnRecognitionEnd(session_id);
591 } else {
592 // At least a client should be have been informed of the error.
593 NOTREACHED();
594 }
595 return session_id;
596 }
597
598 // Set-up the new session.
599 auto session = std::make_unique<Session>();
600 session->id = session_id;
601 session->config = config;
602 session->context = config.initial_context;
603 session->use_microphone = !audio_forwarder_config.has_value();
604
605#if !BUILDFLAG(IS_ANDROID)
606#if !BUILDFLAG(IS_FUCHSIA)
607 if (UseOnDeviceSpeechRecognition(config) &&
608 audio_forwarder_config.has_value()) {
609 CHECK_GT(audio_forwarder_config.value().channel_count, 0);
610 CHECK_GT(audio_forwarder_config.value().sample_rate, 0);
611 // The speech recognition service process will create and manage the speech
612 // recognition session instead of the browser. Raw audio will be passed
613 // directly to the speech recognition process and speech recognition events
614 // will be returned directly to the renderer, bypassing the browser
615 // entirely.
616 if (!speech_recognition_context_.is_bound()) {
617 raw_ptr<SpeechRecognitionManagerDelegate>
618 speech_recognition_mgr_delegate =
619 SpeechRecognitionManagerImpl::GetInstance()
620 ? SpeechRecognitionManagerImpl::GetInstance()->delegate()
621 : nullptr;
622
623 CHECK(speech_recognition_mgr_delegate);
624 mojo::PendingReceiver<media::mojom::SpeechRecognitionContext>
625 speech_recognition_context_receiver =
626 speech_recognition_context_.BindNewPipeAndPassReceiver();
627 speech_recognition_mgr_delegate->BindSpeechRecognitionContext(
Evan Liua89ea522025-07-30 21:23:58628 std::move(speech_recognition_context_receiver), config.language);
Evan Liu79284922025-05-29 20:29:10629 }
630
631 media::mojom::SpeechRecognitionOptionsPtr options =
632 media::mojom::SpeechRecognitionOptions::New();
633 options->recognition_mode = media::mojom::SpeechRecognitionMode::kCaption;
634 options->enable_formatting = false;
635 options->recognizer_client_type =
636 media::mojom::RecognizerClientType::kLiveCaption;
637 options->skip_continuously_empty_audio = true;
638 options->recognition_context = config.recognition_context;
639
640 speech_recognition_context_->BindWebSpeechRecognizer(
641 std::move(session_receiver), std::move(client_remote),
642 std::move(audio_forwarder_config.value().audio_forwarder),
643 audio_forwarder_config.value().channel_count,
644 audio_forwarder_config.value().sample_rate, std::move(options),
645 config.continuous);
646
647 // The session is managed by the speech recognition service directly thus
648 // does not need to be associated with a session id in the browser.
649 return 0;
650 }
651#endif //! BUILDFLAG(IS_FUCHSIA)
652
653 std::unique_ptr<SpeechRecognitionEngine> speech_recognition_engine;
654
655#if !BUILDFLAG(IS_FUCHSIA)
656 if (UseOnDeviceSpeechRecognition(config)) {
657 std::unique_ptr<SodaSpeechRecognitionEngineImpl>
658 soda_speech_recognition_engine =
659 std::make_unique<SodaSpeechRecognitionEngineImpl>(config);
660 if (soda_speech_recognition_engine->Initialize()) {
661 speech_recognition_engine = std::move(soda_speech_recognition_engine);
662 }
663 }
664#endif //! BUILDFLAG(IS_FUCHSIA)
665
666 if (!speech_recognition_engine) {
667 // A NetworkSpeechRecognitionEngineImpl (and corresponding Config) is
668 // required only when using SpeechRecognizerImpl, which performs the audio
669 // capture and endpointing in the browser. This is not the case of Android
670 // where, not only the speech recognition, but also the audio capture and
671 // endpointing activities performed outside of the browser (delegated via
672 // JNI to the Android API implementation).
673
674 NetworkSpeechRecognitionEngineImpl::Config remote_engine_config;
675 remote_engine_config.language = config.language;
676 remote_engine_config.grammars = config.grammars;
677 remote_engine_config.audio_sample_rate =
678 audio_forwarder_config.has_value()
679 ? audio_forwarder_config.value().sample_rate
680 : SpeechRecognizerImpl::kAudioSampleRate;
681 remote_engine_config.audio_num_bits_per_sample =
682 SpeechRecognizerImpl::kNumBitsPerAudioSample;
683 remote_engine_config.filter_profanities = config.filter_profanities;
684 remote_engine_config.continuous = config.continuous;
685 remote_engine_config.interim_results = config.interim_results;
686 remote_engine_config.max_hypotheses = config.max_hypotheses;
687 remote_engine_config.origin_url = config.origin.Serialize();
688 remote_engine_config.auth_token = config.auth_token;
689 remote_engine_config.auth_scope = config.auth_scope;
690 remote_engine_config.preamble = config.preamble;
691
692 std::unique_ptr<NetworkSpeechRecognitionEngineImpl> google_remote_engine =
693 std::make_unique<NetworkSpeechRecognitionEngineImpl>(
694 config.shared_url_loader_factory);
695 google_remote_engine->SetConfig(remote_engine_config);
696 speech_recognition_engine = std::move(google_remote_engine);
697 }
698
699 session->recognizer = new SpeechRecognizerImpl(
700 this, audio_system_, session_id, config.continuous,
701 config.interim_results, std::move(speech_recognition_engine),
702 audio_forwarder_config.has_value()
703 ? std::make_optional<SpeechRecognitionAudioForwarderConfig>(
704 audio_forwarder_config.value())
705 : std::nullopt);
706
707#else
708 session->recognizer = new SpeechRecognizerImplAndroid(this, session_id);
709#endif //! BUILDFLAG(IS_ANDROID)
710
711 sessions_[session_id] = std::move(session);
712
713 GetUIThreadTaskRunner({})->PostTask(
714 FROM_HERE,
715 base::BindOnce(
716 &FrameSessionTracker::CreateObserverForSession,
717 config.initial_context.render_process_id,
718 config.initial_context.render_frame_id, session_id,
719 base::BindRepeating(&SpeechRecognitionManagerImpl::AbortSessionImpl,
720 weak_factory_.GetWeakPtr())));
721
722 return session_id;
723}
724
[email protected]b450e902012-04-25 20:20:18725void SpeechRecognitionManagerImpl::OnRecognitionEnd(int session_id) {
mostynb042582e2015-03-16 22:13:40726 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]b450e902012-04-25 20:20:18727 if (!SessionExists(session_id))
728 return;
729
[email protected]12f4fb92012-05-16 10:30:16730 if (SpeechRecognitionEventListener* delegate_listener = GetDelegateListener())
731 delegate_listener->OnRecognitionEnd(session_id);
732 if (SpeechRecognitionEventListener* listener = GetListener(session_id))
733 listener->OnRecognitionEnd(session_id);
Sean Maher5b9af51f2022-11-21 15:32:47734 base::SingleThreadTaskRunner::GetCurrentDefault()->PostTask(
tzike2aca992017-09-05 08:50:54735 FROM_HERE, base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
736 weak_factory_.GetWeakPtr(), session_id,
737 EVENT_RECOGNITION_ENDED));
[email protected]b450e902012-04-25 20:20:18738}
739
Lucas Furukawa Gadani0a70be92019-06-28 17:31:26740SpeechRecognitionSessionContext SpeechRecognitionManagerImpl::GetSessionContext(
741 int session_id) {
[email protected]2b94cfe2013-06-07 13:12:53742 return GetSession(session_id)->context;
[email protected]b450e902012-04-25 20:20:18743}
744
Evan Liu881ab7a2024-08-01 21:54:51745bool SpeechRecognitionManagerImpl::UseOnDeviceSpeechRecognition(
746 const SpeechRecognitionSessionConfig& config) {
747#if !BUILDFLAG(IS_FUCHSIA) && !BUILDFLAG(IS_ANDROID)
Evan Liub2b7f6d2024-08-05 22:55:48748 return config.on_device &&
Evan Liu68919fc2025-03-06 20:38:18749 (speech::IsOnDeviceSpeechRecognitionAvailable(config.language) ==
750 media::mojom::AvailabilityStatus::kAvailable ||
Evan Liub2b7f6d2024-08-05 22:55:48751 !config.allow_cloud_fallback);
Evan Liu881ab7a2024-08-01 21:54:51752#else
753 return false;
754#endif
755}
756
Avi Drissman6ed86dc32018-03-03 00:29:24757void SpeechRecognitionManagerImpl::AbortAllSessionsForRenderFrame(
[email protected]07c45dd2012-07-06 12:10:34758 int render_process_id,
Avi Drissman6ed86dc32018-03-03 00:29:24759 int render_frame_id) {
mostynb042582e2015-03-16 22:13:40760 DCHECK_CURRENTLY_ON(BrowserThread::IO);
Avi Drissman6ed86dc32018-03-03 00:29:24761
Avi Drissmane6d65502018-02-13 16:21:58762 for (const auto& session_pair : sessions_) {
763 Session* session = session_pair.second.get();
[email protected]2b94cfe2013-06-07 13:12:53764 if (session->context.render_process_id == render_process_id &&
Avi Drissman6ed86dc32018-03-03 00:29:24765 session->context.render_frame_id == render_frame_id) {
[email protected]2b94cfe2013-06-07 13:12:53766 AbortSession(session->id);
[email protected]07c45dd2012-07-06 12:10:34767 }
768 }
769}
770
[email protected]b450e902012-04-25 20:20:18771// ----------------------- Core FSM implementation ---------------------------
772void SpeechRecognitionManagerImpl::DispatchEvent(int session_id,
[email protected]12f4fb92012-05-16 10:30:16773 FSMEvent event) {
mostynb042582e2015-03-16 22:13:40774 DCHECK_CURRENTLY_ON(BrowserThread::IO);
[email protected]d305111d2012-05-24 10:58:24775
776 // There are some corner cases in which the session might be deleted (due to
777 // an EndRecognition event) between a request (e.g. Abort) and its dispatch.
778 if (!SessionExists(session_id))
779 return;
780
[email protected]2b94cfe2013-06-07 13:12:53781 Session* session = GetSession(session_id);
[email protected]12f4fb92012-05-16 10:30:16782 FSMState session_state = GetSessionState(session_id);
783 DCHECK_LE(session_state, SESSION_STATE_MAX_VALUE);
784 DCHECK_LE(event, EVENT_MAX_VALUE);
[email protected]b450e902012-04-25 20:20:18785
786 // Event dispatching must be sequential, otherwise it will break all the rules
787 // and the assumptions of the finite state automata model.
788 DCHECK(!is_dispatching_event_);
789 is_dispatching_event_ = true;
[email protected]12f4fb92012-05-16 10:30:16790 ExecuteTransitionAndGetNextState(session, session_state, event);
[email protected]b450e902012-04-25 20:20:18791 is_dispatching_event_ = false;
792}
793
794// This FSM handles the evolution of each session, from the viewpoint of the
795// interaction with the user (that may be either the browser end-user which
Avi Drissman6ed86dc32018-03-03 00:29:24796// interacts with UI bubbles, or JS developer interacting with JS methods).
[email protected]c91bb262012-06-27 10:56:45797// All the events received by the SpeechRecognizer instances (one for each
[email protected]b450e902012-04-25 20:20:18798// session) are always routed to the SpeechRecognitionEventListener(s)
799// regardless the choices taken in this FSM.
[email protected]12f4fb92012-05-16 10:30:16800void SpeechRecognitionManagerImpl::ExecuteTransitionAndGetNextState(
[email protected]2b94cfe2013-06-07 13:12:53801 Session* session, FSMState session_state, FSMEvent event) {
[email protected]12f4fb92012-05-16 10:30:16802 // Note: since we're not tracking the state of the recognizer object, rather
803 // we're directly retrieving it (through GetSessionState), we see its events
804 // (that are AUDIO_ENDED and RECOGNITION_ENDED) after its state evolution
805 // (e.g., when we receive the AUDIO_ENDED event, the recognizer has just
806 // completed the transition from CAPTURING_AUDIO to WAITING_FOR_RESULT, thus
807 // we perceive the AUDIO_ENDED event in WAITING_FOR_RESULT).
808 // This makes the code below a bit tricky but avoids a lot of code for
809 // tracking and reconstructing asynchronously the state of the recognizer.
810 switch (session_state) {
811 case SESSION_STATE_IDLE:
[email protected]b450e902012-04-25 20:20:18812 switch (event) {
813 case EVENT_START:
[email protected]2b94cfe2013-06-07 13:12:53814 return SessionStart(*session);
Yiren Wang23998562025-01-28 21:31:05815 case EVENT_UPDATE_RECOGNITION_CONTEXT:
816 return SessionUpdateRecognitionContext(*session);
[email protected]b450e902012-04-25 20:20:18817 case EVENT_ABORT:
[email protected]2b94cfe2013-06-07 13:12:53818 return SessionAbort(*session);
[email protected]b450e902012-04-25 20:20:18819 case EVENT_RECOGNITION_ENDED:
[email protected]12f4fb92012-05-16 10:30:16820 return SessionDelete(session);
821 case EVENT_STOP_CAPTURE:
[email protected]2b94cfe2013-06-07 13:12:53822 return SessionStopAudioCapture(*session);
[email protected]12f4fb92012-05-16 10:30:16823 case EVENT_AUDIO_ENDED:
824 return;
[email protected]b450e902012-04-25 20:20:18825 }
826 break;
[email protected]12f4fb92012-05-16 10:30:16827 case SESSION_STATE_CAPTURING_AUDIO:
[email protected]b450e902012-04-25 20:20:18828 switch (event) {
Yiren Wang23998562025-01-28 21:31:05829 case EVENT_UPDATE_RECOGNITION_CONTEXT:
830 return SessionUpdateRecognitionContext(*session);
[email protected]b450e902012-04-25 20:20:18831 case EVENT_STOP_CAPTURE:
[email protected]2b94cfe2013-06-07 13:12:53832 return SessionStopAudioCapture(*session);
[email protected]b450e902012-04-25 20:20:18833 case EVENT_ABORT:
[email protected]2b94cfe2013-06-07 13:12:53834 return SessionAbort(*session);
[email protected]b450e902012-04-25 20:20:18835 case EVENT_START:
[email protected]12f4fb92012-05-16 10:30:16836 return;
837 case EVENT_AUDIO_ENDED:
838 case EVENT_RECOGNITION_ENDED:
[email protected]2b94cfe2013-06-07 13:12:53839 return NotFeasible(*session, event);
[email protected]b450e902012-04-25 20:20:18840 }
841 break;
[email protected]12f4fb92012-05-16 10:30:16842 case SESSION_STATE_WAITING_FOR_RESULT:
[email protected]b450e902012-04-25 20:20:18843 switch (event) {
Yiren Wang23998562025-01-28 21:31:05844 case EVENT_UPDATE_RECOGNITION_CONTEXT:
845 return SessionUpdateRecognitionContext(*session);
[email protected]b450e902012-04-25 20:20:18846 case EVENT_ABORT:
[email protected]2b94cfe2013-06-07 13:12:53847 return SessionAbort(*session);
[email protected]12f4fb92012-05-16 10:30:16848 case EVENT_AUDIO_ENDED:
[email protected]2b94cfe2013-06-07 13:12:53849 return ResetCapturingSessionId(*session);
[email protected]b450e902012-04-25 20:20:18850 case EVENT_START:
851 case EVENT_STOP_CAPTURE:
[email protected]12f4fb92012-05-16 10:30:16852 return;
[email protected]b450e902012-04-25 20:20:18853 case EVENT_RECOGNITION_ENDED:
[email protected]2b94cfe2013-06-07 13:12:53854 return NotFeasible(*session, event);
[email protected]b450e902012-04-25 20:20:18855 }
856 break;
857 }
[email protected]2b94cfe2013-06-07 13:12:53858 return NotFeasible(*session, event);
[email protected]12f4fb92012-05-16 10:30:16859}
860
861SpeechRecognitionManagerImpl::FSMState
862SpeechRecognitionManagerImpl::GetSessionState(int session_id) const {
[email protected]2b94cfe2013-06-07 13:12:53863 Session* session = GetSession(session_id);
864 if (!session->recognizer.get() || !session->recognizer->IsActive())
[email protected]12f4fb92012-05-16 10:30:16865 return SESSION_STATE_IDLE;
[email protected]2b94cfe2013-06-07 13:12:53866 if (session->recognizer->IsCapturingAudio())
[email protected]12f4fb92012-05-16 10:30:16867 return SESSION_STATE_CAPTURING_AUDIO;
868 return SESSION_STATE_WAITING_FOR_RESULT;
[email protected]b450e902012-04-25 20:20:18869}
870
871// ----------- Contract for all the FSM evolution functions below -------------
872// - Are guaranteed to be executed in the IO thread;
873// - Are guaranteed to be not reentrant (themselves and each other);
[email protected]b450e902012-04-25 20:20:18874
[email protected]12f4fb92012-05-16 10:30:16875void SpeechRecognitionManagerImpl::SessionStart(const Session& session) {
Guido Urdaneta73fa6632019-01-14 18:46:26876 const blink::MediaStreamDevices& devices = session.context.devices;
[email protected]579d6992013-06-22 13:40:20877 std::string device_id;
878 if (devices.empty()) {
879 // From the ask_user=false path, use the default device.
880 // TODO(xians): Abort the session after we do not need to support this path
881 // anymore.
olka251dd5692016-04-27 15:50:17882 device_id = media::AudioDeviceDescription::kDefaultDeviceId;
[email protected]579d6992013-06-22 13:40:20883 } else {
884 // From the ask_user=true path, use the selected device.
885 DCHECK_EQ(1u, devices.size());
Antonio Gomesc8b734b2019-06-05 18:22:16886 DCHECK_EQ(blink::mojom::MediaStreamType::DEVICE_AUDIO_CAPTURE,
887 devices.front().type);
[email protected]579d6992013-06-22 13:40:20888 device_id = devices.front().id;
889 }
[email protected]76f9f04e2013-06-20 06:38:23890
[email protected]579d6992013-06-22 13:40:20891 session.recognizer->StartRecognition(device_id);
[email protected]b450e902012-04-25 20:20:18892}
893
Yiren Wang23998562025-01-28 21:31:05894void SpeechRecognitionManagerImpl::SessionUpdateRecognitionContext(
895 const Session& session) {
896 CHECK(session.recognizer.get());
897 session.recognizer->UpdateRecognitionContext(session.recognition_context);
898}
899
[email protected]12f4fb92012-05-16 10:30:16900void SpeechRecognitionManagerImpl::SessionAbort(const Session& session) {
Evan Liu1c0164d2025-02-12 20:40:36901 if (microphone_session_id_ == session.id) {
902 microphone_session_id_ = kSessionIDInvalid;
903 }
[email protected]2af35c502012-09-13 20:14:43904 DCHECK(session.recognizer.get());
[email protected]b450e902012-04-25 20:20:18905 session.recognizer->AbortRecognition();
[email protected]b450e902012-04-25 20:20:18906}
907
[email protected]12f4fb92012-05-16 10:30:16908void SpeechRecognitionManagerImpl::SessionStopAudioCapture(
909 const Session& session) {
[email protected]2af35c502012-09-13 20:14:43910 DCHECK(session.recognizer.get());
[email protected]12f4fb92012-05-16 10:30:16911 session.recognizer->StopAudioCapture();
[email protected]b450e902012-04-25 20:20:18912}
913
[email protected]12f4fb92012-05-16 10:30:16914void SpeechRecognitionManagerImpl::ResetCapturingSessionId(
915 const Session& session) {
Evan Liu1c0164d2025-02-12 20:40:36916 microphone_session_id_ = kSessionIDInvalid;
[email protected]b450e902012-04-25 20:20:18917}
918
[email protected]2b94cfe2013-06-07 13:12:53919void SpeechRecognitionManagerImpl::SessionDelete(Session* session) {
Ivan Kotenkov2c0d2bb32017-11-01 15:41:28920 DCHECK(session->recognizer.get() == nullptr ||
921 !session->recognizer->IsActive());
Evan Liu1c0164d2025-02-12 20:40:36922 if (microphone_session_id_ == session->id) {
923 microphone_session_id_ = kSessionIDInvalid;
924 }
[email protected]2542d88a2013-09-30 15:41:07925 if (!session->context.label.empty())
926 media_stream_manager_->CancelRequest(session->context.label);
[email protected]2b94cfe2013-06-07 13:12:53927 sessions_.erase(session->id);
[email protected]b450e902012-04-25 20:20:18928}
929
[email protected]12f4fb92012-05-16 10:30:16930void SpeechRecognitionManagerImpl::NotFeasible(const Session& session,
931 FSMEvent event) {
Peter Boströmfc7ddc182024-10-31 19:37:21932 NOTREACHED() << "Unfeasible event " << event << " in state "
933 << GetSessionState(session.id) << " for session " << session.id;
[email protected]b450e902012-04-25 20:20:18934}
935
936int SpeechRecognitionManagerImpl::GetNextSessionID() {
937 ++last_session_id_;
938 // Deal with wrapping of last_session_id_. (How civilized).
939 if (last_session_id_ <= 0)
940 last_session_id_ = 1;
941 return last_session_id_;
942}
943
944bool SpeechRecognitionManagerImpl::SessionExists(int session_id) const {
945 return sessions_.find(session_id) != sessions_.end();
946}
947
[email protected]2b94cfe2013-06-07 13:12:53948SpeechRecognitionManagerImpl::Session*
[email protected]12f4fb92012-05-16 10:30:16949SpeechRecognitionManagerImpl::GetSession(int session_id) const {
mostynb042582e2015-03-16 22:13:40950 DCHECK_CURRENTLY_ON(BrowserThread::IO);
Avi Drissmane6d65502018-02-13 16:21:58951 auto iter = sessions_.find(session_id);
Daniel Cheng4d54f0a2025-05-26 22:59:12952 CHECK(iter != sessions_.end());
Avi Drissmane6d65502018-02-13 16:21:58953 return iter->second.get();
[email protected]b450e902012-04-25 20:20:18954}
955
[email protected]12f4fb92012-05-16 10:30:16956SpeechRecognitionEventListener* SpeechRecognitionManagerImpl::GetListener(
957 int session_id) const {
[email protected]2b94cfe2013-06-07 13:12:53958 Session* session = GetSession(session_id);
Avi Drissman6ed86dc32018-03-03 00:29:24959 if (session->config.event_listener)
[email protected]e2eb2f22014-02-20 14:36:00960 return session->config.event_listener.get();
Ivan Kotenkov2c0d2bb32017-11-01 15:41:28961 return nullptr;
[email protected]12f4fb92012-05-16 10:30:16962}
963
964SpeechRecognitionEventListener*
965SpeechRecognitionManagerImpl::GetDelegateListener() const {
Ivan Kotenkov2c0d2bb32017-11-01 15:41:28966 return delegate_.get() ? delegate_->GetEventListener() : nullptr;
[email protected]12f4fb92012-05-16 10:30:16967}
968
969const SpeechRecognitionSessionConfig&
Lucas Furukawa Gadani0a70be92019-06-28 17:31:26970SpeechRecognitionManagerImpl::GetSessionConfig(int session_id) {
[email protected]2b94cfe2013-06-07 13:12:53971 return GetSession(session_id)->config;
[email protected]12f4fb92012-05-16 10:30:16972}
[email protected]b450e902012-04-25 20:20:18973
[email protected]b450e902012-04-25 20:20:18974SpeechRecognitionManagerImpl::Session::Session()
Avi Drissman6ed86dc32018-03-03 00:29:24975 : id(kSessionIDInvalid), abort_requested(false) {}
[email protected]0944a7292011-09-21 16:45:06976
[email protected]b450e902012-04-25 20:20:18977SpeechRecognitionManagerImpl::Session::~Session() {
[email protected]84c13c032011-09-23 00:12:22978}
979
[email protected]fcb8e0212012-10-29 11:57:18980} // namespace content