Blame - content/browser/speech/speech_recognizer_impl.h - chromium/src

blob: 2bbd6d5d58eb60c8d6e12e6b93eb74ee615c83da [file] [log] [blame]

Avi Drissman	4e1b7bc3	2022-09-15 14:03:50	[diff] [blame]	1	// Copyright 2013 The Chromium Authors
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
				6	#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
				7
dcheng	5971627	2016-04-09 05:19:08	[diff] [blame]	8	#include <memory>
maxmorin	3500cac	2017-02-03 10:14:27	[diff] [blame]	9	#include <string>
dcheng	5971627	2016-04-09 05:19:08	[diff] [blame]	10
Keishi Hattori	0e45c02	2021-11-27 09:25:52	[diff] [blame]	11	#include "base/memory/raw_ptr.h"
olka	ef762c9	2017-02-06 16:45:16	[diff] [blame]	12	#include "base/memory/weak_ptr.h"
Evan Liu	90a83152	2024-05-20 18:20:02	[diff] [blame]	13	#include "components/speech/endpointer/endpointer.h"
Evan Liu	1f45824	2024-07-24 22:12:20	[diff] [blame]	14	#include "components/speech/speech_recognizer_fsm.h"
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	15	#include "content/browser/speech/speech_recognition_engine.h"
				16	#include "content/browser/speech/speech_recognizer.h"
Lei Zhang	7ab31375	2021-11-17 01:26:00	[diff] [blame]	17	#include "content/common/content_export.h"
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	18	#include "media/base/audio_capturer_source.h"
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	19	#include "media/mojo/mojom/audio_data.mojom.h"
				20	#include "media/mojo/mojom/speech_recognition.mojom.h"
				21	#include "media/mojo/mojom/speech_recognition_audio_forwarder.mojom.h"
Evan Liu	d725228	2024-05-16 20:10:32	[diff] [blame]	22	#include "media/mojo/mojom/speech_recognition_error.mojom.h"
				23	#include "media/mojo/mojom/speech_recognition_result.mojom.h"
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	24	#include "media/mojo/mojom/speech_recognizer.mojom.h"
				25	#include "mojo/public/cpp/bindings/receiver.h"
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	26
				27	namespace media {
[email protected]	f361a0d	2014-06-19 12:38:56	[diff] [blame]	28	class AudioBus;  // Referenced by pointer in Capture().
olka	ef762c9	2017-02-06 16:45:16	[diff] [blame]	29	class AudioSystem;  // Referenced by pointer (constructor, GetAudioSystem()).
Guido Urdaneta	35a8698	2018-02-20 13:34:38	[diff] [blame]	30	} // namespace media
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	31
				32	namespace content {
				33
				34	class SpeechRecognitionEventListener;  // Passed to the constructor by pointer.
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	35	struct SpeechRecognitionAudioForwarderConfig;  // Optional constructor argument.
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	36
				37	// Handles speech recognition for a session (identified by |session_id|), taking
				38	// care of audio capture, silence detection/endpointer and interaction with the
				39	// SpeechRecognitionEngine.
					// NOTE(review): this header names std::optional and std::vector but, of the
					// standard library, directly includes only <memory> and <string>; consider
					// including <optional> and <vector> directly.
				40	class CONTENT_EXPORT SpeechRecognizerImpl
				41	: public SpeechRecognizer,
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	42	public media::AudioCapturerSource::CaptureCallback,
Evan Liu	1f45824	2024-07-24 22:12:20	[diff] [blame]	43	public SpeechRecognitionEngine::Delegate,
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	44	public speech::SpeechRecognizerFsm,
				45	public media::mojom::SpeechRecognitionAudioForwarder {
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	46	public:
					// Audio format and timing constants. The *Ms values are presumably
					// milliseconds, per their suffix.
Raul Tambre	b1da244	2019-04-07 18:18:03	[diff] [blame]	47	static constexpr int kAudioSampleRate = 16000;
				48	static constexpr media::ChannelLayout kChannelLayout =
				49	media::CHANNEL_LAYOUT_MONO;
				50	static constexpr int kNumBitsPerAudioSample = 16;
				51	static constexpr int kNoSpeechTimeoutMs = 8000;
				52	static constexpr int kEndpointerEstimationTimeMs = 300;
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	53
					// Test hook. NOTE(review): presumably stores its arguments in
					// audio_system_for_tests_ / audio_capturer_source_for_tests_ declared
					// below -- confirm in the .cc file.
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	54	static void SetAudioEnvironmentForTesting(
				55	media::AudioSystem* audio_system,
				56	media::AudioCapturerSource* capturer_source);
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	57
					// Takes ownership of |engine| (passed as std::unique_ptr).
					// |audio_forwarder_config| is a std::optional and may be empty.
					// NOTE(review): |listener| and |audio_system| are raw pointers; their
					// required lifetimes are not visible in this header.
				58	SpeechRecognizerImpl(SpeechRecognitionEventListener* listener,
olka	ef762c9	2017-02-06 16:45:16	[diff] [blame]	59	media::AudioSystem* audio_system,
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	60	int session_id,
[email protected]	6073144	2014-01-16 04:59:59	[diff] [blame]	61	bool continuous,
				62	bool provisional_results,
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	63	std::unique_ptr<SpeechRecognitionEngine> engine,
				64	std::optional<SpeechRecognitionAudioForwarderConfig>
				65	audio_forwarder_config);
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	66
					// Not copyable or copy-assignable.
Peter Boström	9b03653	2021-10-28 23:37:28	[diff] [blame]	67	SpeechRecognizerImpl(const SpeechRecognizerImpl&) = delete;
				68	SpeechRecognizerImpl& operator=(const SpeechRecognizerImpl&) = delete;
				69
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	70	// SpeechRecognizer methods.
dcheng	c2282aa	2014-10-21 12:07:58	[diff] [blame]	71	void StartRecognition(const std::string& device_id) override;
Yiren Wang	2399856	2025-01-28 21:31:05	[diff] [blame]	72	void UpdateRecognitionContext(
				73	const media::SpeechRecognitionRecognitionContext& recognition_context)
				74	override;
dcheng	c2282aa	2014-10-21 12:07:58	[diff] [blame]	75	void AbortRecognition() override;
				76	void StopAudioCapture() override;
				77	bool IsActive() const override;
				78	bool IsCapturingAudio() const override;
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	79
					// Read-only access to the recognition engine.
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	80	const SpeechRecognitionEngine& recognition_engine() const;
				81
				82	private:
				83	friend class SpeechRecognizerTest;
				84
					// NOTE(review): the destructor is private, so instances cannot be destroyed
					// directly by outside code; presumably destruction goes through the
					// SpeechRecognizer base (e.g. ref-counting) -- confirm.
dcheng	c2282aa	2014-10-21 12:07:58	[diff] [blame]	85	~SpeechRecognizerImpl() override;
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	86
olka	ef762c9	2017-02-06 16:45:16	[diff] [blame]	87	// Callback from AudioSystem. |params| is a std::optional and may be empty.
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	88	void OnAudioParametersReceived(
				89	const std::optional<media::AudioParameters>& params);
olka	ef762c9	2017-02-06 16:45:16	[diff] [blame]	90
Evan Liu	1f45824	2024-07-24 22:12:20	[diff] [blame]	91	// speech::SpeechRecognizerFsm implementation.
				92	// ProcessAudioPipeline() processes a new audio chunk in the audio pipeline (endpointer, vumeter, etc).
				93	void DispatchEvent(const FSMEventArgs& event_args) override;
				94	void ProcessAudioPipeline(const FSMEventArgs& event_args) override;
					// The handlers below each return an FSMState; presumably the state the FSM
					// should move to next -- confirm in components/speech/speech_recognizer_fsm.h.
				95	FSMState PrepareRecognition(const FSMEventArgs&) override;
				96	FSMState StartRecording(const FSMEventArgs& event_args) override;
				97	FSMState StartRecognitionEngine(const FSMEventArgs& event_args) override;
				98	FSMState WaitEnvironmentEstimationCompletion(
				99	const FSMEventArgs& event_args) override;
				100	FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args) override;
				101	FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args) override;
				102	FSMState ProcessIntermediateResult(const FSMEventArgs& event_args) override;
				103	FSMState ProcessFinalResult(const FSMEventArgs& event_args) override;
				104	FSMState AbortSilently(const FSMEventArgs& event_args) override;
				105	FSMState AbortWithError(const FSMEventArgs& event_args) override;
				106	FSMState Abort(const media::mojom::SpeechRecognitionError& error) override;
				107	FSMState DetectEndOfSpeech(const FSMEventArgs& event_args) override;
Yiren Wang	2399856	2025-01-28 21:31:05	[diff] [blame]	108	FSMState UpdateRecognitionContext(const FSMEventArgs& event_args) override;
Evan Liu	1f45824	2024-07-24 22:12:20	[diff] [blame]	109	FSMState DoNothing(const FSMEventArgs& event_args) const override;
				110	FSMState NotFeasible(const FSMEventArgs& event_args) override;
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	111
				112	// Returns the time span of captured audio samples since the start of capture.
				113	int GetElapsedTimeMs() const;
				114
				115	// Calculates the input volume to be displayed in the UI, triggering the
				116	// OnAudioLevelsChange event accordingly.
				117	void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected);
				118
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	119	void CloseAudioCapturerSource();
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	120
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	121	// media::AudioCapturerSource::CaptureCallback methods.
				122	void OnCaptureStarted() final {}  // Intentionally a no-op.
				123	void Capture(const media::AudioBus* audio_bus,
Jeroen Dhollander	6be574b	2019-07-17 18:48:23	[diff] [blame]	124	base::TimeTicks audio_capture_time,
Fredrik Hernqvist	35a8135	2024-03-04 15:03:55	[diff] [blame]	125	const media::AudioGlitchInfo& glitch_info,
Fredrik Hernqvist	590f6576	2024-11-14 13:42:15	[diff] [blame]	126	double volume) final;
Tony Herre	003731d	2021-05-25 22:09:56	[diff] [blame]	127	void OnCaptureError(media::AudioCapturerSource::ErrorCode code,
				128	const std::string& message) final;
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	129	void OnCaptureMuted(bool is_muted) final {}  // Intentionally a no-op.
tommi	ce9a251	2017-01-16 18:35:18	[diff] [blame]	130
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	131	// media::mojom::SpeechRecognitionAudioForwarder methods.
				132	void AddAudioFromRenderer(media::mojom::AudioDataS16Ptr buffer) override;
				133
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	134	// SpeechRecognitionEngine::Delegate methods.
dcheng	c2282aa	2014-10-21 12:07:58	[diff] [blame]	135	void OnSpeechRecognitionEngineResults(
Evan Liu	d725228	2024-05-16 20:10:32	[diff] [blame]	136	const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results)
Adithya Srinivasan	c35bf396	2018-06-12 14:28:14	[diff] [blame]	137	override;
gshires	78afd3eb	2015-09-26 01:32:56	[diff] [blame]	138	void OnSpeechRecognitionEngineEndOfUtterance() override;
dcheng	c2282aa	2014-10-21 12:07:58	[diff] [blame]	139	void OnSpeechRecognitionEngineError(
Evan Liu	d725228	2024-05-16 20:10:32	[diff] [blame]	140	const media::mojom::SpeechRecognitionError& error) override;
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	141
					// Helpers for obtaining the audio system and capturer source.
					// NOTE(review): presumably these honor the *_for_tests_ overrides declared
					// below -- confirm in the .cc file.
olka	ef762c9	2017-02-06 16:45:16	[diff] [blame]	142	media::AudioSystem* GetAudioSystem();
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	143	void CreateAudioCapturerSource();
				144	media::AudioCapturerSource* GetAudioCapturerSource();
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	145
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	146	// Substitute the real audio system and capturer source in browser tests.
olka	ef762c9	2017-02-06 16:45:16	[diff] [blame]	147	static media::AudioSystem* audio_system_for_tests_;
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	148	static media::AudioCapturerSource* audio_capturer_source_for_tests_;
				149
					// NOTE(review): DanglingUntriaged marks a raw_ptr that is allowed to dangle
					// and has not yet been triaged; see Chromium's dangling pointer docs.
Arthur Sonzogni	e98d214	2023-06-01 15:02:25	[diff] [blame]	150	raw_ptr<media::AudioSystem, DanglingUntriaged> audio_system_;
dcheng	5971627	2016-04-09 05:19:08	[diff] [blame]	151	std::unique_ptr<SpeechRecognitionEngine> recognition_engine_;
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	152	int sample_rate_;
Evan Liu	90a83152	2024-05-20 18:20:02	[diff] [blame]	153	speech::Endpointer endpointer_;
Marina Ciocea	68e948e	2018-05-10 22:26:17	[diff] [blame]	154	scoped_refptr<media::AudioCapturerSource> audio_capturer_source_;
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	155	int num_samples_recorded_;
				156	float audio_level_;
[email protected]	6073144	2014-01-16 04:59:59	[diff] [blame]	157	bool provisional_results_;
gshires	78afd3eb	2015-09-26 01:32:56	[diff] [blame]	158	bool end_of_utterance_;
[email protected]	76f9f04e	2013-06-20 06:38:23	[diff] [blame]	159	std::string device_id_;
Evan Liu	881ab7a	2024-08-01 21:54:51	[diff] [blame]	160	media::AudioParameters audio_parameters_;
				161	bool use_audio_capturer_source_ = true;
				162	mojo::Receiver<media::mojom::SpeechRecognitionAudioForwarder>
				163	audio_forwarder_receiver_;
olka	ef762c9	2017-02-06 16:45:16	[diff] [blame]	164	media::AudioParameters device_params_;
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	165
					// Forward declaration of a nested helper class; its definition is not part
					// of this header.
[email protected]	2e50f6d7	2013-06-17 14:41:38	[diff] [blame]	166	class OnDataConverter;
				167
				168	// Converts data between native input format and a WebSpeech specific
				169	// output format.
dcheng	5971627	2016-04-09 05:19:08	[diff] [blame]	170	std::unique_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_;
[email protected]	2e50f6d7	2013-06-17 14:41:38	[diff] [blame]	171
					// Keep this as the last member: base::WeakPtrFactory's documentation asks
					// for it to be declared last so that outstanding WeakPtrs are invalidated
					// before the other members are destroyed.
Jeremy Roman	3bca4bf	2019-07-11 03:41:25	[diff] [blame]	172	base::WeakPtrFactory<SpeechRecognizerImpl> weak_ptr_factory_{this};
[email protected]	ce1adc34	2013-05-20 13:35:43	[diff] [blame]	173	};
				174
				175	} // namespace content
				176
				177	#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_