blob: 2bbd6d5d58eb60c8d6e12e6b93eb74ee615c83da [file] [log] [blame]
// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_

#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "base/memory/raw_ptr.h"
#include "base/memory/weak_ptr.h"
#include "components/speech/endpointer/endpointer.h"
#include "components/speech/speech_recognizer_fsm.h"
#include "content/browser/speech/speech_recognition_engine.h"
#include "content/browser/speech/speech_recognizer.h"
#include "content/common/content_export.h"
#include "media/base/audio_capturer_source.h"
#include "media/mojo/mojom/audio_data.mojom.h"
#include "media/mojo/mojom/speech_recognition.mojom.h"
#include "media/mojo/mojom/speech_recognition_audio_forwarder.mojom.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "media/mojo/mojom/speech_recognition_result.mojom.h"
#include "media/mojo/mojom/speech_recognizer.mojom.h"
#include "mojo/public/cpp/bindings/receiver.h"

// Forward declarations of media types that the declarations below refer to
// only through pointers, so their full definitions are not needed here.
namespace media {
class AudioBus;
class AudioSystem;
}  // namespace media

32namespace content {
33
34class SpeechRecognitionEventListener;
Evan Liu881ab7a2024-08-01 21:54:5135struct SpeechRecognitionAudioForwarderConfig;
[email protected]ce1adc342013-05-20 13:35:4336
37// Handles speech recognition for a session (identified by |session_id|), taking
38// care of audio capture, silence detection/endpointer and interaction with the
39// SpeechRecognitionEngine.
40class CONTENT_EXPORT SpeechRecognizerImpl
41 : public SpeechRecognizer,
Marina Ciocea68e948e2018-05-10 22:26:1742 public media::AudioCapturerSource::CaptureCallback,
Evan Liu1f458242024-07-24 22:12:2043 public SpeechRecognitionEngine::Delegate,
Evan Liu881ab7a2024-08-01 21:54:5144 public speech::SpeechRecognizerFsm,
45 public media::mojom::SpeechRecognitionAudioForwarder {
[email protected]ce1adc342013-05-20 13:35:4346 public:
Raul Tambreb1da2442019-04-07 18:18:0347 static constexpr int kAudioSampleRate = 16000;
48 static constexpr media::ChannelLayout kChannelLayout =
49 media::CHANNEL_LAYOUT_MONO;
50 static constexpr int kNumBitsPerAudioSample = 16;
51 static constexpr int kNoSpeechTimeoutMs = 8000;
52 static constexpr int kEndpointerEstimationTimeMs = 300;
[email protected]ce1adc342013-05-20 13:35:4353
Marina Ciocea68e948e2018-05-10 22:26:1754 static void SetAudioEnvironmentForTesting(
55 media::AudioSystem* audio_system,
56 media::AudioCapturerSource* capturer_source);
[email protected]ce1adc342013-05-20 13:35:4357
58 SpeechRecognizerImpl(SpeechRecognitionEventListener* listener,
olkaef762c92017-02-06 16:45:1659 media::AudioSystem* audio_system,
[email protected]ce1adc342013-05-20 13:35:4360 int session_id,
[email protected]60731442014-01-16 04:59:5961 bool continuous,
62 bool provisional_results,
Evan Liu881ab7a2024-08-01 21:54:5163 std::unique_ptr<SpeechRecognitionEngine> engine,
64 std::optional<SpeechRecognitionAudioForwarderConfig>
65 audio_forwarder_config);
[email protected]ce1adc342013-05-20 13:35:4366
Peter Boström9b036532021-10-28 23:37:2867 SpeechRecognizerImpl(const SpeechRecognizerImpl&) = delete;
68 SpeechRecognizerImpl& operator=(const SpeechRecognizerImpl&) = delete;
69
Marina Ciocea68e948e2018-05-10 22:26:1770 // SpeechRecognizer methods.
dchengc2282aa2014-10-21 12:07:5871 void StartRecognition(const std::string& device_id) override;
Yiren Wang23998562025-01-28 21:31:0572 void UpdateRecognitionContext(
73 const media::SpeechRecognitionRecognitionContext& recognition_context)
74 override;
dchengc2282aa2014-10-21 12:07:5875 void AbortRecognition() override;
76 void StopAudioCapture() override;
77 bool IsActive() const override;
78 bool IsCapturingAudio() const override;
Marina Ciocea68e948e2018-05-10 22:26:1779
[email protected]ce1adc342013-05-20 13:35:4380 const SpeechRecognitionEngine& recognition_engine() const;
81
82 private:
83 friend class SpeechRecognizerTest;
84
dchengc2282aa2014-10-21 12:07:5885 ~SpeechRecognizerImpl() override;
[email protected]ce1adc342013-05-20 13:35:4386
olkaef762c92017-02-06 16:45:1687 // Callback from AudioSystem.
Evan Liu881ab7a2024-08-01 21:54:5188 void OnAudioParametersReceived(
89 const std::optional<media::AudioParameters>& params);
olkaef762c92017-02-06 16:45:1690
Evan Liu1f458242024-07-24 22:12:2091 // speech::SpeechRecognizerFsm implementation.
92 // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc).
93 void DispatchEvent(const FSMEventArgs& event_args) override;
94 void ProcessAudioPipeline(const FSMEventArgs& event_args) override;
95 FSMState PrepareRecognition(const FSMEventArgs&) override;
96 FSMState StartRecording(const FSMEventArgs& event_args) override;
97 FSMState StartRecognitionEngine(const FSMEventArgs& event_args) override;
98 FSMState WaitEnvironmentEstimationCompletion(
99 const FSMEventArgs& event_args) override;
100 FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args) override;
101 FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args) override;
102 FSMState ProcessIntermediateResult(const FSMEventArgs& event_args) override;
103 FSMState ProcessFinalResult(const FSMEventArgs& event_args) override;
104 FSMState AbortSilently(const FSMEventArgs& event_args) override;
105 FSMState AbortWithError(const FSMEventArgs& event_args) override;
106 FSMState Abort(const media::mojom::SpeechRecognitionError& error) override;
107 FSMState DetectEndOfSpeech(const FSMEventArgs& event_args) override;
Yiren Wang23998562025-01-28 21:31:05108 FSMState UpdateRecognitionContext(const FSMEventArgs& event_args) override;
Evan Liu1f458242024-07-24 22:12:20109 FSMState DoNothing(const FSMEventArgs& event_args) const override;
110 FSMState NotFeasible(const FSMEventArgs& event_args) override;
[email protected]ce1adc342013-05-20 13:35:43111
112 // Returns the time span of captured audio samples since the start of capture.
113 int GetElapsedTimeMs() const;
114
115 // Calculates the input volume to be displayed in the UI, triggering the
116 // OnAudioLevelsChange event accordingly.
117 void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected);
118
Marina Ciocea68e948e2018-05-10 22:26:17119 void CloseAudioCapturerSource();
[email protected]ce1adc342013-05-20 13:35:43120
Marina Ciocea68e948e2018-05-10 22:26:17121 // media::AudioCapturerSource::CaptureCallback methods.
122 void OnCaptureStarted() final {}
123 void Capture(const media::AudioBus* audio_bus,
Jeroen Dhollander6be574b2019-07-17 18:48:23124 base::TimeTicks audio_capture_time,
Fredrik Hernqvist35a81352024-03-04 15:03:55125 const media::AudioGlitchInfo& glitch_info,
Fredrik Hernqvist590f65762024-11-14 13:42:15126 double volume) final;
Tony Herre003731d2021-05-25 22:09:56127 void OnCaptureError(media::AudioCapturerSource::ErrorCode code,
128 const std::string& message) final;
Marina Ciocea68e948e2018-05-10 22:26:17129 void OnCaptureMuted(bool is_muted) final {}
tommice9a2512017-01-16 18:35:18130
Evan Liu881ab7a2024-08-01 21:54:51131 // media::mojom::blink::SpeechRecognitionAudioForwarder methods.
132 void AddAudioFromRenderer(media::mojom::AudioDataS16Ptr buffer) override;
133
[email protected]ce1adc342013-05-20 13:35:43134 // SpeechRecognitionEngineDelegate methods.
dchengc2282aa2014-10-21 12:07:58135 void OnSpeechRecognitionEngineResults(
Evan Liud7252282024-05-16 20:10:32136 const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results)
Adithya Srinivasanc35bf3962018-06-12 14:28:14137 override;
gshires78afd3eb2015-09-26 01:32:56138 void OnSpeechRecognitionEngineEndOfUtterance() override;
dchengc2282aa2014-10-21 12:07:58139 void OnSpeechRecognitionEngineError(
Evan Liud7252282024-05-16 20:10:32140 const media::mojom::SpeechRecognitionError& error) override;
[email protected]ce1adc342013-05-20 13:35:43141
olkaef762c92017-02-06 16:45:16142 media::AudioSystem* GetAudioSystem();
Marina Ciocea68e948e2018-05-10 22:26:17143 void CreateAudioCapturerSource();
144 media::AudioCapturerSource* GetAudioCapturerSource();
[email protected]ce1adc342013-05-20 13:35:43145
Marina Ciocea68e948e2018-05-10 22:26:17146 // Substitute the real audio system and capturer source in browser tests.
olkaef762c92017-02-06 16:45:16147 static media::AudioSystem* audio_system_for_tests_;
Marina Ciocea68e948e2018-05-10 22:26:17148 static media::AudioCapturerSource* audio_capturer_source_for_tests_;
149
Arthur Sonzognie98d2142023-06-01 15:02:25150 raw_ptr<media::AudioSystem, DanglingUntriaged> audio_system_;
dcheng59716272016-04-09 05:19:08151 std::unique_ptr<SpeechRecognitionEngine> recognition_engine_;
Evan Liu881ab7a2024-08-01 21:54:51152 int sample_rate_;
Evan Liu90a831522024-05-20 18:20:02153 speech::Endpointer endpointer_;
Marina Ciocea68e948e2018-05-10 22:26:17154 scoped_refptr<media::AudioCapturerSource> audio_capturer_source_;
[email protected]ce1adc342013-05-20 13:35:43155 int num_samples_recorded_;
156 float audio_level_;
[email protected]60731442014-01-16 04:59:59157 bool provisional_results_;
gshires78afd3eb2015-09-26 01:32:56158 bool end_of_utterance_;
[email protected]76f9f04e2013-06-20 06:38:23159 std::string device_id_;
Evan Liu881ab7a2024-08-01 21:54:51160 media::AudioParameters audio_parameters_;
161 bool use_audio_capturer_source_ = true;
162 mojo::Receiver<media::mojom::SpeechRecognitionAudioForwarder>
163 audio_forwarder_receiver_;
olkaef762c92017-02-06 16:45:16164 media::AudioParameters device_params_;
[email protected]ce1adc342013-05-20 13:35:43165
[email protected]2e50f6d72013-06-17 14:41:38166 class OnDataConverter;
167
168 // Converts data between native input format and a WebSpeech specific
169 // output format.
dcheng59716272016-04-09 05:19:08170 std::unique_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_;
[email protected]2e50f6d72013-06-17 14:41:38171
Jeremy Roman3bca4bf2019-07-11 03:41:25172 base::WeakPtrFactory<SpeechRecognizerImpl> weak_ptr_factory_{this};
[email protected]ce1adc342013-05-20 13:35:43173};
174
175} // namespace content
176
#endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_