// content/browser/speech/speech_recognition_manager_impl.h - chromium/src - Git at Google

 // Copyright 2012 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_
 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_

 #include <memory>
 #include <optional>

 #include "base/containers/flat_map.h"
 #include "base/memory/raw_ptr.h"
 #include "base/memory/weak_ptr.h"
 #include "content/common/content_export.h"
 #include "content/public/browser/browser_thread.h"
 #include "content/public/browser/speech_recognition_event_listener.h"
 #include "content/public/browser/speech_recognition_manager.h"
 #include "content/public/browser/speech_recognition_session_config.h"
 #include "content/public/browser/speech_recognition_session_context.h"
 #include "media/mojo/mojom/speech_recognition.mojom.h"
 #include "media/mojo/mojom/speech_recognition_error.mojom.h"
 #include "mojo/public/cpp/bindings/pending_receiver.h"
 #include "mojo/public/cpp/bindings/remote.h"
 #include "third_party/blink/public/mojom/mediastream/media_stream.mojom-forward.h"

 // Forward declaration: this header refers to media::AudioSystem only through
 // pointers (the constructor parameter and the |audio_system_| member).
 namespace media {
 class AudioSystem;
 }

 namespace content {
 // Forward declarations for types this header uses only as incomplete types:
 // BrowserMainLoop is named in a friend declaration, MediaStreamManager is held
 // by pointer, and the remaining types appear as smart-pointer template
 // arguments or pointer return types.
 class BrowserMainLoop;
 class MediaStreamManager;
 class MediaStreamUIProxy;
 class SpeechRecognitionManagerDelegate;
 class SpeechRecognizer;

 // This is the manager for speech recognition. It is a single instance in
 // the browser process and can serve several requests. Each recognition request
 // corresponds to a session, initiated via |CreateSession|.
 //
 // At any moment, at most one session is using the microphone; its id is kept
 // in |microphone_session_id_|. This is the session that is capturing audio,
 // waiting for user permission, etc. There may also be other, non-primary,
 // sessions living in parallel that are waiting for results but not recording
 // audio.
 //
 // The SpeechRecognitionManager has the following responsibilities:
 //  - Handles requests received from various render frames and makes sure only
 //    one of them accesses the audio device at any given time.
 //  - Handles the instantiation of NetworkSpeechRecognitionEngineImpl objects
 //    when requested by SpeechRecognitionSessions.
 //  - Relays recognition results/status/error events of each session to the
 //    corresponding listener (demuxing on the basis of their session_id).
 //  - Also relays recognition results/status/error events of every session to
 //    the catch-all snoop listener (optionally) provided by the delegate.
 class CONTENT_EXPORT SpeechRecognitionManagerImpl
     : public SpeechRecognitionManager,
       public SpeechRecognitionEventListener {
  public:
   // Returns the current SpeechRecognitionManagerImpl, or a null pointer if the
   // call is issued before it has been created or after it has been destroyed
   // (by BrowserMainLoop).
   static SpeechRecognitionManagerImpl* GetInstance();

   // NOTE(review): presumably reports whether on-device speech recognition is
   // installed for the language/settings in |config|; the exact criteria are
   // defined in the .cc file -- confirm there.
   static bool IsOnDeviceSpeechRecognitionInstalled(
       const SpeechRecognitionSessionConfig& config);

   // SpeechRecognitionManager implementation.
   int CreateSession(const SpeechRecognitionSessionConfig& config) override;
   int CreateSession(
       const SpeechRecognitionSessionConfig& config,
       mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
           session_receiver,
       mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
           client_remote,
       std::optional<SpeechRecognitionAudioForwarderConfig>
           audio_forwarder_config) override;
   void StartSession(int session_id) override;
   void AbortSession(int session_id) override;
   void AbortAllSessionsForRenderFrame(int render_process_id,
                                       int render_frame_id) override;
   void StopAudioCaptureForSession(int session_id) override;
   void UpdateRecognitionContextForSession(
       int session_id,
       const media::SpeechRecognitionRecognitionContext& recognition_context)
       override;
   const SpeechRecognitionSessionConfig& GetSessionConfig(
       int session_id) override;
   SpeechRecognitionSessionContext GetSessionContext(int session_id) override;
   bool UseOnDeviceSpeechRecognition(
       const SpeechRecognitionSessionConfig& config) override;

   // SpeechRecognitionEventListener methods.
   void OnRecognitionStart(int session_id) override;
   void OnAudioStart(int session_id) override;
   void OnSoundStart(int session_id) override;
   void OnSoundEnd(int session_id) override;
   void OnAudioEnd(int session_id) override;
   void OnRecognitionEnd(int session_id) override;
   void OnRecognitionResults(
       int session_id,
       const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& result)
       override;
   void OnRecognitionError(
       int session_id,
       const media::mojom::SpeechRecognitionError& error) override;
   void OnAudioLevelsChange(int session_id,
                            float volume,
                            float noise_volume) override;

   // Non-virtual overload of CreateSession (it is not marked override) that
   // takes the same arguments as the four-argument override above plus
   // |can_render_frame_use_on_device|.
   // NOTE(review): how the extra flag influences session creation is decided in
   // the .cc file -- confirm before relying on it.
   int CreateSession(
       const SpeechRecognitionSessionConfig& config,
       mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
           session_receiver,
       mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
           client_remote,
       std::optional<SpeechRecognitionAudioForwarderConfig>
           audio_forwarder_config,
       bool can_render_frame_use_on_device);

   // Returns a non-owning pointer to the delegate; |delegate_| (a unique_ptr)
   // keeps ownership, and the result is null when no delegate is held.
   SpeechRecognitionManagerDelegate* delegate() const { return delegate_.get(); }

  protected:
   // Only BrowserMainLoop and tests are allowed to instantiate this class.
   friend class BrowserMainLoop;
   friend class SpeechRecognitionManagerImplTest;

   // Needed for deletion on the IO thread.
   friend std::default_delete<SpeechRecognitionManagerImpl>;
   friend class base::DeleteHelper<content::SpeechRecognitionManagerImpl>;

   // Both pointers are stored in the raw_ptr members |audio_system_| and
   // |media_stream_manager_| declared below.
   // NOTE(review): this class does not appear to own either object; lifetime
   // requirements should be confirmed with the caller (BrowserMainLoop).
   SpeechRecognitionManagerImpl(media::AudioSystem* audio_system,
                                MediaStreamManager* media_stream_manager);
   ~SpeechRecognitionManagerImpl() override;

  private:

   // Data types for the internal Finite State Machine (FSM).
   // SESSION_STATE_MAX_VALUE is an alias for the last real state, not an extra
   // state of its own.
   enum FSMState {
     SESSION_STATE_IDLE = 0,
     SESSION_STATE_CAPTURING_AUDIO,
     SESSION_STATE_WAITING_FOR_RESULT,
     SESSION_STATE_MAX_VALUE = SESSION_STATE_WAITING_FOR_RESULT
   };

   // Events that can be fed to the FSM through DispatchEvent(). EVENT_MAX_VALUE
   // is an alias for the last real event.
   enum FSMEvent {
     EVENT_ABORT = 0,
     EVENT_START,
     EVENT_UPDATE_RECOGNITION_CONTEXT,
     EVENT_STOP_CAPTURE,
     EVENT_AUDIO_ENDED,
     EVENT_RECOGNITION_ENDED,
     EVENT_MAX_VALUE = EVENT_RECOGNITION_ENDED
   };

   // Per-session bookkeeping. Instances are owned by |sessions_| through
   // unique_ptr. The constructor and destructor are only declared here and are
   // defined out of line.
   // NOTE(review): |id|, |abort_requested| and |use_microphone| have no in-class
   // initializers; confirm that Session() initializes them.
   struct Session {
     Session();
     ~Session();

     int id;
     bool abort_requested;
     SpeechRecognitionSessionConfig config;
     SpeechRecognitionSessionContext context;
     scoped_refptr<SpeechRecognizer> recognizer;
     std::unique_ptr<MediaStreamUIProxy> ui;
     bool use_microphone;
     media::SpeechRecognitionRecognitionContext recognition_context;
   };

   // NOTE(review): presumably the internal counterpart of AbortSession();
   // confirm its exact role in the .cc file.
   void AbortSessionImpl(int session_id);

   // Callback issued by the SpeechRecognitionManagerDelegate for reporting
   // asynchronously the result of the CheckRecognitionIsAllowed call.
   void RecognitionAllowedCallback(int session_id,
                                   bool ask_user,
                                   bool is_allowed);

   // Callback to get back the result of a media request. |stream_devices_set|
   // carries the devices approved to be used for the request; it is empty if
   // the user denies the request.
   // NOTE(review): earlier wording called this parameter |devices|; the
   // "empty on denial" contract is carried over from that comment -- confirm.
   void MediaRequestPermissionCallback(
       int session_id,
       const blink::mojom::StreamDevicesSet& stream_devices_set,
       std::unique_ptr<MediaStreamUIProxy> stream_ui);

   // Entry point for pushing any external event into the session handling FSM.
   void DispatchEvent(int session_id, FSMEvent event);

   // Defines the behavior of the session handling FSM, selecting the appropriate
   // transition according to the session, its current state and the event.
   // Despite the name, this function returns void; no next state is handed back
   // to the caller.
   void ExecuteTransitionAndGetNextState(Session* session,
                                         FSMState session_state,
                                         FSMEvent event);

   // Retrieves the state of the session by querying the recognizer directly.
   FSMState GetSessionState(int session_id) const;

   // The methods below handle transitions of the session handling FSM.
   void SessionStart(const Session& session);
   void SessionUpdateRecognitionContext(const Session& session);
   void SessionAbort(const Session& session);
   void SessionStopAudioCapture(const Session& session);
   void ResetCapturingSessionId(const Session& session);
   void SessionDelete(Session* session);
   void NotFeasible(const Session& session, FSMEvent event);

   // Lookup helpers. Note that GetSession() is const yet returns a pointer to a
   // mutable Session.
   bool SessionExists(int session_id) const;
   Session* GetSession(int session_id) const;
   SpeechRecognitionEventListener* GetListener(int session_id) const;
   SpeechRecognitionEventListener* GetDelegateListener() const;
   int GetNextSessionID();

   // Shared across all instances (static); defined out of line.
   // NOTE(review): presumably the source of |requester_id_| -- confirm.
   static int next_requester_id_;

   raw_ptr<media::AudioSystem> audio_system_;
   raw_ptr<MediaStreamManager> media_stream_manager_;
   // Owns every live Session, keyed by an int.
   // NOTE(review): presumably the key is the session id -- confirm.
   base::flat_map<int, std::unique_ptr<Session>> sessions_;
   int microphone_session_id_ = kSessionIDInvalid;
   int last_session_id_ = kSessionIDInvalid;
   bool is_dispatching_event_ = false;
   std::unique_ptr<SpeechRecognitionManagerDelegate> delegate_;
   // const with no in-class initializer, so it has to be set in the
   // constructor's member initializer list.
   const int requester_id_;

   mojo::Remote<media::mojom::SpeechRecognitionContext>
       speech_recognition_context_;

   // Used for posting asynchronous tasks (on the IO thread) without worrying
   // about this class being destroyed in the meanwhile (due to browser shutdown)
   // since tasks pending on a destroyed WeakPtr are automatically discarded.
   // Declared last: members are destroyed in reverse declaration order, so the
   // factory goes away before any other member does.
   base::WeakPtrFactory<SpeechRecognitionManagerImpl> weak_factory_{this};
 };

 }  // namespace content

 #endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_
	// Copyright 2012 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_
	#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_

	#include <memory>
	#include <optional>

	#include "base/containers/flat_map.h"
	#include "base/memory/raw_ptr.h"
	#include "base/memory/weak_ptr.h"
	#include "content/common/content_export.h"
	#include "content/public/browser/browser_thread.h"
	#include "content/public/browser/speech_recognition_event_listener.h"
	#include "content/public/browser/speech_recognition_manager.h"
	#include "content/public/browser/speech_recognition_session_config.h"
	#include "content/public/browser/speech_recognition_session_context.h"
	#include "media/mojo/mojom/speech_recognition.mojom.h"
	#include "media/mojo/mojom/speech_recognition_error.mojom.h"
	#include "mojo/public/cpp/bindings/pending_receiver.h"
	#include "mojo/public/cpp/bindings/remote.h"
	#include "third_party/blink/public/mojom/mediastream/media_stream.mojom-forward.h"

	// Forward declaration: this header refers to media::AudioSystem only through
	// pointers (the constructor parameter and the |audio_system_| member).
	namespace media {
	class AudioSystem;
	}

	namespace content {
	// Forward declarations for types this header uses only as incomplete types:
	// BrowserMainLoop is named in a friend declaration, MediaStreamManager is held
	// by pointer, and the remaining types appear as smart-pointer template
	// arguments or pointer return types.
	class BrowserMainLoop;
	class MediaStreamManager;
	class MediaStreamUIProxy;
	class SpeechRecognitionManagerDelegate;
	class SpeechRecognizer;

	// This is the manager for speech recognition. It is a single instance in
	// the browser process and can serve several requests. Each recognition request
	// corresponds to a session, initiated via |CreateSession|.
	//
	// At any moment, at most one session is using the microphone; its id is kept
	// in |microphone_session_id_|. This is the session that is capturing audio,
	// waiting for user permission, etc. There may also be other, non-primary,
	// sessions living in parallel that are waiting for results but not recording
	// audio.
	//
	// The SpeechRecognitionManager has the following responsibilities:
	//  - Handles requests received from various render frames and makes sure only
	//    one of them accesses the audio device at any given time.
	//  - Handles the instantiation of NetworkSpeechRecognitionEngineImpl objects
	//    when requested by SpeechRecognitionSessions.
	//  - Relays recognition results/status/error events of each session to the
	//    corresponding listener (demuxing on the basis of their session_id).
	//  - Also relays recognition results/status/error events of every session to
	//    the catch-all snoop listener (optionally) provided by the delegate.
	//
	// NOTE(review): this declaration repeats one that appears earlier in this
	// file under the same include guard; because the guard macro is already
	// defined at this point, the preprocessor skips this copy.
	class CONTENT_EXPORT SpeechRecognitionManagerImpl
	: public SpeechRecognitionManager,
	public SpeechRecognitionEventListener {
	public:
	// Returns the current SpeechRecognitionManagerImpl, or a null pointer if the
	// call is issued before it has been created or after it has been destroyed
	// (by BrowserMainLoop).
	static SpeechRecognitionManagerImpl* GetInstance();

	// NOTE(review): presumably reports whether on-device speech recognition is
	// installed for the language/settings in |config|; the exact criteria are
	// defined in the .cc file -- confirm there.
	static bool IsOnDeviceSpeechRecognitionInstalled(
	const SpeechRecognitionSessionConfig& config);

	// SpeechRecognitionManager implementation.
	int CreateSession(const SpeechRecognitionSessionConfig& config) override;
	int CreateSession(
	const SpeechRecognitionSessionConfig& config,
	mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
	session_receiver,
	mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
	client_remote,
	std::optional<SpeechRecognitionAudioForwarderConfig>
	audio_forwarder_config) override;
	void StartSession(int session_id) override;
	void AbortSession(int session_id) override;
	void AbortAllSessionsForRenderFrame(int render_process_id,
	int render_frame_id) override;
	void StopAudioCaptureForSession(int session_id) override;
	void UpdateRecognitionContextForSession(
	int session_id,
	const media::SpeechRecognitionRecognitionContext& recognition_context)
	override;
	const SpeechRecognitionSessionConfig& GetSessionConfig(
	int session_id) override;
	SpeechRecognitionSessionContext GetSessionContext(int session_id) override;
	bool UseOnDeviceSpeechRecognition(
	const SpeechRecognitionSessionConfig& config) override;

	// SpeechRecognitionEventListener methods.
	void OnRecognitionStart(int session_id) override;
	void OnAudioStart(int session_id) override;
	void OnSoundStart(int session_id) override;
	void OnSoundEnd(int session_id) override;
	void OnAudioEnd(int session_id) override;
	void OnRecognitionEnd(int session_id) override;
	void OnRecognitionResults(
	int session_id,
	const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& result)
	override;
	void OnRecognitionError(
	int session_id,
	const media::mojom::SpeechRecognitionError& error) override;
	void OnAudioLevelsChange(int session_id,
	float volume,
	float noise_volume) override;

	// Non-virtual overload of CreateSession (it is not marked override) that
	// takes the same arguments as the four-argument override above plus
	// |can_render_frame_use_on_device|.
	// NOTE(review): how the extra flag influences session creation is decided in
	// the .cc file -- confirm before relying on it.
	int CreateSession(
	const SpeechRecognitionSessionConfig& config,
	mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
	session_receiver,
	mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
	client_remote,
	std::optional<SpeechRecognitionAudioForwarderConfig>
	audio_forwarder_config,
	bool can_render_frame_use_on_device);

	// Returns a non-owning pointer to the delegate; |delegate_| (a unique_ptr)
	// keeps ownership, and the result is null when no delegate is held.
	SpeechRecognitionManagerDelegate* delegate() const { return delegate_.get(); }

	protected:
	// Only BrowserMainLoop and tests are allowed to instantiate this class.
	friend class BrowserMainLoop;
	friend class SpeechRecognitionManagerImplTest;

	// Needed for deletion on the IO thread.
	friend std::default_delete<SpeechRecognitionManagerImpl>;
	friend class base::DeleteHelper<content::SpeechRecognitionManagerImpl>;

	// Both pointers are stored in the raw_ptr members |audio_system_| and
	// |media_stream_manager_| declared below.
	// NOTE(review): this class does not appear to own either object; lifetime
	// requirements should be confirmed with the caller (BrowserMainLoop).
	SpeechRecognitionManagerImpl(media::AudioSystem* audio_system,
	MediaStreamManager* media_stream_manager);
	~SpeechRecognitionManagerImpl() override;

	private:

	// Data types for the internal Finite State Machine (FSM).
	// SESSION_STATE_MAX_VALUE is an alias for the last real state, not an extra
	// state of its own.
	enum FSMState {
	SESSION_STATE_IDLE = 0,
	SESSION_STATE_CAPTURING_AUDIO,
	SESSION_STATE_WAITING_FOR_RESULT,
	SESSION_STATE_MAX_VALUE = SESSION_STATE_WAITING_FOR_RESULT
	};

	// Events that can be fed to the FSM through DispatchEvent(). EVENT_MAX_VALUE
	// is an alias for the last real event.
	enum FSMEvent {
	EVENT_ABORT = 0,
	EVENT_START,
	EVENT_UPDATE_RECOGNITION_CONTEXT,
	EVENT_STOP_CAPTURE,
	EVENT_AUDIO_ENDED,
	EVENT_RECOGNITION_ENDED,
	EVENT_MAX_VALUE = EVENT_RECOGNITION_ENDED
	};

	// Per-session bookkeeping. Instances are owned by |sessions_| through
	// unique_ptr. The constructor and destructor are only declared here and are
	// defined out of line.
	// NOTE(review): |id|, |abort_requested| and |use_microphone| have no in-class
	// initializers; confirm that Session() initializes them.
	struct Session {
	Session();
	~Session();

	int id;
	bool abort_requested;
	SpeechRecognitionSessionConfig config;
	SpeechRecognitionSessionContext context;
	scoped_refptr<SpeechRecognizer> recognizer;
	std::unique_ptr<MediaStreamUIProxy> ui;
	bool use_microphone;
	media::SpeechRecognitionRecognitionContext recognition_context;
	};

	// NOTE(review): presumably the internal counterpart of AbortSession();
	// confirm its exact role in the .cc file.
	void AbortSessionImpl(int session_id);

	// Callback issued by the SpeechRecognitionManagerDelegate for reporting
	// asynchronously the result of the CheckRecognitionIsAllowed call.
	void RecognitionAllowedCallback(int session_id,
	bool ask_user,
	bool is_allowed);

	// Callback to get back the result of a media request. |stream_devices_set|
	// carries the devices approved to be used for the request; it is empty if
	// the user denies the request.
	// NOTE(review): earlier wording called this parameter |devices|; the
	// "empty on denial" contract is carried over from that comment -- confirm.
	void MediaRequestPermissionCallback(
	int session_id,
	const blink::mojom::StreamDevicesSet& stream_devices_set,
	std::unique_ptr<MediaStreamUIProxy> stream_ui);

	// Entry point for pushing any external event into the session handling FSM.
	void DispatchEvent(int session_id, FSMEvent event);

	// Defines the behavior of the session handling FSM, selecting the appropriate
	// transition according to the session, its current state and the event.
	// Despite the name, this function returns void; no next state is handed back
	// to the caller.
	void ExecuteTransitionAndGetNextState(Session* session,
	FSMState session_state,
	FSMEvent event);

	// Retrieves the state of the session by querying the recognizer directly.
	FSMState GetSessionState(int session_id) const;

	// The methods below handle transitions of the session handling FSM.
	void SessionStart(const Session& session);
	void SessionUpdateRecognitionContext(const Session& session);
	void SessionAbort(const Session& session);
	void SessionStopAudioCapture(const Session& session);
	void ResetCapturingSessionId(const Session& session);
	void SessionDelete(Session* session);
	void NotFeasible(const Session& session, FSMEvent event);

	// Lookup helpers. Note that GetSession() is const yet returns a pointer to a
	// mutable Session.
	bool SessionExists(int session_id) const;
	Session* GetSession(int session_id) const;
	SpeechRecognitionEventListener* GetListener(int session_id) const;
	SpeechRecognitionEventListener* GetDelegateListener() const;
	int GetNextSessionID();

	// Shared across all instances (static); defined out of line.
	// NOTE(review): presumably the source of |requester_id_| -- confirm.
	static int next_requester_id_;

	raw_ptr<media::AudioSystem> audio_system_;
	raw_ptr<MediaStreamManager> media_stream_manager_;
	// Owns every live Session, keyed by an int.
	// NOTE(review): presumably the key is the session id -- confirm.
	base::flat_map<int, std::unique_ptr<Session>> sessions_;
	int microphone_session_id_ = kSessionIDInvalid;
	int last_session_id_ = kSessionIDInvalid;
	bool is_dispatching_event_ = false;
	std::unique_ptr<SpeechRecognitionManagerDelegate> delegate_;
	// const with no in-class initializer, so it has to be set in the
	// constructor's member initializer list.
	const int requester_id_;

	mojo::Remote<media::mojom::SpeechRecognitionContext>
	speech_recognition_context_;

	// Used for posting asynchronous tasks (on the IO thread) without worrying
	// about this class being destroyed in the meanwhile (due to browser shutdown)
	// since tasks pending on a destroyed WeakPtr are automatically discarded.
	// Declared last: members are destroyed in reverse declaration order, so the
	// factory goes away before any other member does.
	base::WeakPtrFactory<SpeechRecognitionManagerImpl> weak_factory_{this};
	};

	} // namespace content

	#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_