Avi Drissman | 4e1b7bc3 | 2022-09-15 14:03:50 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ |
| 6 | #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ |
| 7 | |
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "base/memory/raw_ptr.h"
#include "base/memory/weak_ptr.h"
#include "components/speech/endpointer/endpointer.h"
#include "components/speech/speech_recognizer_fsm.h"
#include "content/browser/speech/speech_recognition_engine.h"
#include "content/browser/speech/speech_recognizer.h"
#include "content/common/content_export.h"
#include "media/base/audio_capturer_source.h"
#include "media/mojo/mojom/audio_data.mojom.h"
#include "media/mojo/mojom/speech_recognition.mojom.h"
#include "media/mojo/mojom/speech_recognition_audio_forwarder.mojom.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "media/mojo/mojom/speech_recognition_result.mojom.h"
#include "media/mojo/mojom/speech_recognizer.mojom.h"
#include "mojo/public/cpp/bindings/receiver.h"
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 26 | |
// Forward declarations: this header refers to these media types only through
// pointers, so their full definitions are not required here.
namespace media {
class AudioBus;
class AudioSystem;
}  // namespace media
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 31 | |
| 32 | namespace content { |
| 33 | |
// Forward declarations for content types that this header only names in
// function declarations (constructor parameters of SpeechRecognizerImpl).
class SpeechRecognitionEventListener;
struct SpeechRecognitionAudioForwarderConfig;
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 36 | |
| 37 | // Handles speech recognition for a session (identified by |session_id|), taking |
| 38 | // care of audio capture, silence detection/endpointer and interaction with the |
| 39 | // SpeechRecognitionEngine. |
| 40 | class CONTENT_EXPORT SpeechRecognizerImpl |
| 41 | : public SpeechRecognizer, |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 42 | public media::AudioCapturerSource::CaptureCallback, |
Evan Liu | 1f45824 | 2024-07-24 22:12:20 | [diff] [blame] | 43 | public SpeechRecognitionEngine::Delegate, |
Evan Liu | 881ab7a | 2024-08-01 21:54:51 | [diff] [blame] | 44 | public speech::SpeechRecognizerFsm, |
| 45 | public media::mojom::SpeechRecognitionAudioForwarder { |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 46 | public: |
Raul Tambre | b1da244 | 2019-04-07 18:18:03 | [diff] [blame] | 47 | static constexpr int kAudioSampleRate = 16000; |
| 48 | static constexpr media::ChannelLayout kChannelLayout = |
| 49 | media::CHANNEL_LAYOUT_MONO; |
| 50 | static constexpr int kNumBitsPerAudioSample = 16; |
| 51 | static constexpr int kNoSpeechTimeoutMs = 8000; |
| 52 | static constexpr int kEndpointerEstimationTimeMs = 300; |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 53 | |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 54 | static void SetAudioEnvironmentForTesting( |
| 55 | media::AudioSystem* audio_system, |
| 56 | media::AudioCapturerSource* capturer_source); |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 57 | |
| 58 | SpeechRecognizerImpl(SpeechRecognitionEventListener* listener, |
olka | ef762c9 | 2017-02-06 16:45:16 | [diff] [blame] | 59 | media::AudioSystem* audio_system, |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 60 | int session_id, |
[email protected] | 6073144 | 2014-01-16 04:59:59 | [diff] [blame] | 61 | bool continuous, |
| 62 | bool provisional_results, |
Evan Liu | 881ab7a | 2024-08-01 21:54:51 | [diff] [blame] | 63 | std::unique_ptr<SpeechRecognitionEngine> engine, |
| 64 | std::optional<SpeechRecognitionAudioForwarderConfig> |
| 65 | audio_forwarder_config); |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 66 | |
Peter Boström | 9b03653 | 2021-10-28 23:37:28 | [diff] [blame] | 67 | SpeechRecognizerImpl(const SpeechRecognizerImpl&) = delete; |
| 68 | SpeechRecognizerImpl& operator=(const SpeechRecognizerImpl&) = delete; |
| 69 | |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 70 | // SpeechRecognizer methods. |
dcheng | c2282aa | 2014-10-21 12:07:58 | [diff] [blame] | 71 | void StartRecognition(const std::string& device_id) override; |
Yiren Wang | 2399856 | 2025-01-28 21:31:05 | [diff] [blame] | 72 | void UpdateRecognitionContext( |
| 73 | const media::SpeechRecognitionRecognitionContext& recognition_context) |
| 74 | override; |
dcheng | c2282aa | 2014-10-21 12:07:58 | [diff] [blame] | 75 | void AbortRecognition() override; |
| 76 | void StopAudioCapture() override; |
| 77 | bool IsActive() const override; |
| 78 | bool IsCapturingAudio() const override; |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 79 | |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 80 | const SpeechRecognitionEngine& recognition_engine() const; |
| 81 | |
| 82 | private: |
| 83 | friend class SpeechRecognizerTest; |
| 84 | |
dcheng | c2282aa | 2014-10-21 12:07:58 | [diff] [blame] | 85 | ~SpeechRecognizerImpl() override; |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 86 | |
olka | ef762c9 | 2017-02-06 16:45:16 | [diff] [blame] | 87 | // Callback from AudioSystem. |
Evan Liu | 881ab7a | 2024-08-01 21:54:51 | [diff] [blame] | 88 | void OnAudioParametersReceived( |
| 89 | const std::optional<media::AudioParameters>& params); |
olka | ef762c9 | 2017-02-06 16:45:16 | [diff] [blame] | 90 | |
Evan Liu | 1f45824 | 2024-07-24 22:12:20 | [diff] [blame] | 91 | // speech::SpeechRecognizerFsm implementation. |
| 92 | // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc). |
| 93 | void DispatchEvent(const FSMEventArgs& event_args) override; |
| 94 | void ProcessAudioPipeline(const FSMEventArgs& event_args) override; |
| 95 | FSMState PrepareRecognition(const FSMEventArgs&) override; |
| 96 | FSMState StartRecording(const FSMEventArgs& event_args) override; |
| 97 | FSMState StartRecognitionEngine(const FSMEventArgs& event_args) override; |
| 98 | FSMState WaitEnvironmentEstimationCompletion( |
| 99 | const FSMEventArgs& event_args) override; |
| 100 | FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args) override; |
| 101 | FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args) override; |
| 102 | FSMState ProcessIntermediateResult(const FSMEventArgs& event_args) override; |
| 103 | FSMState ProcessFinalResult(const FSMEventArgs& event_args) override; |
| 104 | FSMState AbortSilently(const FSMEventArgs& event_args) override; |
| 105 | FSMState AbortWithError(const FSMEventArgs& event_args) override; |
| 106 | FSMState Abort(const media::mojom::SpeechRecognitionError& error) override; |
| 107 | FSMState DetectEndOfSpeech(const FSMEventArgs& event_args) override; |
Yiren Wang | 2399856 | 2025-01-28 21:31:05 | [diff] [blame] | 108 | FSMState UpdateRecognitionContext(const FSMEventArgs& event_args) override; |
Evan Liu | 1f45824 | 2024-07-24 22:12:20 | [diff] [blame] | 109 | FSMState DoNothing(const FSMEventArgs& event_args) const override; |
| 110 | FSMState NotFeasible(const FSMEventArgs& event_args) override; |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 111 | |
| 112 | // Returns the time span of captured audio samples since the start of capture. |
| 113 | int GetElapsedTimeMs() const; |
| 114 | |
| 115 | // Calculates the input volume to be displayed in the UI, triggering the |
| 116 | // OnAudioLevelsChange event accordingly. |
| 117 | void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected); |
| 118 | |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 119 | void CloseAudioCapturerSource(); |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 120 | |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 121 | // media::AudioCapturerSource::CaptureCallback methods. |
| 122 | void OnCaptureStarted() final {} |
| 123 | void Capture(const media::AudioBus* audio_bus, |
Jeroen Dhollander | 6be574b | 2019-07-17 18:48:23 | [diff] [blame] | 124 | base::TimeTicks audio_capture_time, |
Fredrik Hernqvist | 35a8135 | 2024-03-04 15:03:55 | [diff] [blame] | 125 | const media::AudioGlitchInfo& glitch_info, |
Fredrik Hernqvist | 590f6576 | 2024-11-14 13:42:15 | [diff] [blame] | 126 | double volume) final; |
Tony Herre | 003731d | 2021-05-25 22:09:56 | [diff] [blame] | 127 | void OnCaptureError(media::AudioCapturerSource::ErrorCode code, |
| 128 | const std::string& message) final; |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 129 | void OnCaptureMuted(bool is_muted) final {} |
tommi | ce9a251 | 2017-01-16 18:35:18 | [diff] [blame] | 130 | |
Evan Liu | 881ab7a | 2024-08-01 21:54:51 | [diff] [blame] | 131 | // media::mojom::blink::SpeechRecognitionAudioForwarder methods. |
| 132 | void AddAudioFromRenderer(media::mojom::AudioDataS16Ptr buffer) override; |
| 133 | |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 134 | // SpeechRecognitionEngineDelegate methods. |
dcheng | c2282aa | 2014-10-21 12:07:58 | [diff] [blame] | 135 | void OnSpeechRecognitionEngineResults( |
Evan Liu | d725228 | 2024-05-16 20:10:32 | [diff] [blame] | 136 | const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results) |
Adithya Srinivasan | c35bf396 | 2018-06-12 14:28:14 | [diff] [blame] | 137 | override; |
gshires | 78afd3eb | 2015-09-26 01:32:56 | [diff] [blame] | 138 | void OnSpeechRecognitionEngineEndOfUtterance() override; |
dcheng | c2282aa | 2014-10-21 12:07:58 | [diff] [blame] | 139 | void OnSpeechRecognitionEngineError( |
Evan Liu | d725228 | 2024-05-16 20:10:32 | [diff] [blame] | 140 | const media::mojom::SpeechRecognitionError& error) override; |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 141 | |
olka | ef762c9 | 2017-02-06 16:45:16 | [diff] [blame] | 142 | media::AudioSystem* GetAudioSystem(); |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 143 | void CreateAudioCapturerSource(); |
| 144 | media::AudioCapturerSource* GetAudioCapturerSource(); |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 145 | |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 146 | // Substitute the real audio system and capturer source in browser tests. |
olka | ef762c9 | 2017-02-06 16:45:16 | [diff] [blame] | 147 | static media::AudioSystem* audio_system_for_tests_; |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 148 | static media::AudioCapturerSource* audio_capturer_source_for_tests_; |
| 149 | |
Arthur Sonzogni | e98d214 | 2023-06-01 15:02:25 | [diff] [blame] | 150 | raw_ptr<media::AudioSystem, DanglingUntriaged> audio_system_; |
dcheng | 5971627 | 2016-04-09 05:19:08 | [diff] [blame] | 151 | std::unique_ptr<SpeechRecognitionEngine> recognition_engine_; |
Evan Liu | 881ab7a | 2024-08-01 21:54:51 | [diff] [blame] | 152 | int sample_rate_; |
Evan Liu | 90a83152 | 2024-05-20 18:20:02 | [diff] [blame] | 153 | speech::Endpointer endpointer_; |
Marina Ciocea | 68e948e | 2018-05-10 22:26:17 | [diff] [blame] | 154 | scoped_refptr<media::AudioCapturerSource> audio_capturer_source_; |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 155 | int num_samples_recorded_; |
| 156 | float audio_level_; |
[email protected] | 6073144 | 2014-01-16 04:59:59 | [diff] [blame] | 157 | bool provisional_results_; |
gshires | 78afd3eb | 2015-09-26 01:32:56 | [diff] [blame] | 158 | bool end_of_utterance_; |
[email protected] | 76f9f04e | 2013-06-20 06:38:23 | [diff] [blame] | 159 | std::string device_id_; |
Evan Liu | 881ab7a | 2024-08-01 21:54:51 | [diff] [blame] | 160 | media::AudioParameters audio_parameters_; |
| 161 | bool use_audio_capturer_source_ = true; |
| 162 | mojo::Receiver<media::mojom::SpeechRecognitionAudioForwarder> |
| 163 | audio_forwarder_receiver_; |
olka | ef762c9 | 2017-02-06 16:45:16 | [diff] [blame] | 164 | media::AudioParameters device_params_; |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 165 | |
[email protected] | 2e50f6d7 | 2013-06-17 14:41:38 | [diff] [blame] | 166 | class OnDataConverter; |
| 167 | |
| 168 | // Converts data between native input format and a WebSpeech specific |
| 169 | // output format. |
dcheng | 5971627 | 2016-04-09 05:19:08 | [diff] [blame] | 170 | std::unique_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_; |
[email protected] | 2e50f6d7 | 2013-06-17 14:41:38 | [diff] [blame] | 171 | |
Jeremy Roman | 3bca4bf | 2019-07-11 03:41:25 | [diff] [blame] | 172 | base::WeakPtrFactory<SpeechRecognizerImpl> weak_ptr_factory_{this}; |
[email protected] | ce1adc34 | 2013-05-20 13:35:43 | [diff] [blame] | 173 | }; |
| 174 | |
| 175 | } // namespace content |
| 176 | |
| 177 | #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ |