1- // Copyright 2018 Google LLC.
1+ // Copyright 2019 Google LLC.
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@ syntax = "proto3";
1818package google.cloud.texttospeech.v1 ;
1919
2020import "google/api/annotations.proto" ;
21+ import "google/api/client.proto" ;
22+ import "google/api/field_behavior.proto" ;
2123
2224option cc_enable_arenas = true ;
2325option csharp_namespace = "Google.Cloud.TextToSpeech.V1" ;
@@ -29,35 +31,79 @@ option php_namespace = "Google\\Cloud\\TextToSpeech\\V1";
2931
3032// Service that implements Google Cloud Text-to-Speech API.
3133service TextToSpeech {
34+ option (google.api.default_host ) = "texttospeech.googleapis.com" ;
35+ option (google.api.oauth_scopes ) = "https://www.googleapis.com/auth/cloud-platform" ;
36+
3237 // Returns a list of Voice supported for synthesis.
3338 rpc ListVoices (ListVoicesRequest ) returns (ListVoicesResponse ) {
3439 option (google.api.http ) = {
3540 get : "/v1/voices"
3641 };
42+ option (google.api.method_signature ) = "language_code" ;
3743 }
3844
3945 // Synthesizes speech synchronously: receive results after all text input
4046 // has been processed.
41- rpc SynthesizeSpeech (SynthesizeSpeechRequest )
42- returns (SynthesizeSpeechResponse ) {
47+ rpc SynthesizeSpeech (SynthesizeSpeechRequest ) returns (SynthesizeSpeechResponse ) {
4348 option (google.api.http ) = {
4449 post : "/v1/text:synthesize"
4550 body : "*"
4651 };
52+ option (google.api.method_signature ) = "input,voice,audio_config" ;
4753 }
4854}
4955
5056// The top-level message sent by the client for the `ListVoices` method.
5157message ListVoicesRequest {
52- // Optional (but recommended)
58+ // Optional. Recommended.
5359 // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. If
5460 // specified, the ListVoices call will only return voices that can be used to
5561 // synthesize this language_code. E.g. when specifying "en-NZ", you will get
5662 // supported "en-*" voices; when specifying "no", you will get supported
5763 // "no-*" (Norwegian) and "nb-*" (Norwegian Bokmal) voices; specifying "zh"
5864 // will also get supported "cmn-*" voices; specifying "zh-hk" will also get
5965 // supported "yue-*" voices.
60- string language_code = 1 ;
66+ string language_code = 1 [(google.api.field_behavior ) = OPTIONAL ];
67+ }
68+
69+ // Gender of the voice as described in
70+ // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
71+ enum SsmlVoiceGender {
72+ // An unspecified gender.
73+ // In VoiceSelectionParams, this means that the client doesn't care which
74+ // gender the selected voice will have. In the Voice field of
75+ // ListVoicesResponse, this may mean that the voice doesn't fit any of the
76+ // other categories in this enum, or that the gender of the voice isn't known.
77+ SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
78+
79+ // A male voice.
80+ MALE = 1 ;
81+
82+ // A female voice.
83+ FEMALE = 2 ;
84+
85+ // A gender-neutral voice.
86+ NEUTRAL = 3 ;
87+ }
88+
89+ // Configuration to set up audio encoder. The encoding determines the output
90+ // audio format that we'd like.
91+ enum AudioEncoding {
92+ // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
93+ AUDIO_ENCODING_UNSPECIFIED = 0 ;
94+
95+ // Uncompressed 16-bit signed little-endian samples (Linear PCM).
96+ // Audio content returned as LINEAR16 also contains a WAV header.
97+ LINEAR16 = 1 ;
98+
99+ // MP3 audio at 32kbps.
100+ MP3 = 2 ;
101+
102+ // Opus encoded audio wrapped in an ogg container. The result will be a
103+ // file which can be played natively on Android, and in browsers (at least
104+ // Chrome and Firefox). The quality of the encoding is considerably higher
105+ // than MP3 while using approximately the same bitrate.
106+ OGG_OPUS = 3 ;
61107}
62108
63109// The message returned to the client by the `ListVoices` method.
@@ -86,13 +132,13 @@ message Voice {
86132// The top-level message sent by the client for the `SynthesizeSpeech` method.
87133message SynthesizeSpeechRequest {
88134 // Required. The Synthesizer requires either plain text or SSML as input.
89- SynthesisInput input = 1 ;
135+ SynthesisInput input = 1 [ (google.api .field_behavior ) = REQUIRED ] ;
90136
91137 // Required. The desired voice of the synthesized audio.
92- VoiceSelectionParams voice = 2 ;
138+ VoiceSelectionParams voice = 2 [ (google.api .field_behavior ) = REQUIRED ] ;
93139
94140 // Required. The configuration of the synthesized audio.
95- AudioConfig audio_config = 3 ;
141+ AudioConfig audio_config = 3 [ (google.api .field_behavior ) = REQUIRED ] ;
96142}
97143
98144// Contains text input to be synthesized. Either `text` or `ssml` must be
@@ -115,9 +161,9 @@ message SynthesisInput {
115161
116162// Description of which voice to use for a synthesis request.
117163message VoiceSelectionParams {
118- // The language (and optionally also the region) of the voice expressed as a
164+ // Required. The language (and potentially also the region) of the voice expressed as a
119165 // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag, e.g.
120- // "en-US". Required. This should not include a script tag (e.g. use
166+ // "en-US". This should not include a script tag (e.g. use
121167 // "cmn-cn" rather than "cmn-Hant-cn"), because the script will be inferred
122168 // from the input provided in the SynthesisInput. The TTS service
123169 // will use this parameter to help choose an appropriate voice. Note that
@@ -126,13 +172,13 @@ message VoiceSelectionParams {
126172 // (e.g. using en-US rather than en-CA if there isn't a Canadian voice
127173 // available), or even a different language, e.g. using "nb" (Norwegian
128174 // Bokmal) instead of "no" (Norwegian).
129- string language_code = 1 ;
175+ string language_code = 1 [ (google.api .field_behavior ) = REQUIRED ] ;
130176
131- // The name of the voice. Optional; if not set, the service will choose a
177+ // The name of the voice. If not set, the service will choose a
132178 // voice based on the other parameters such as language_code and gender.
133179 string name = 2 ;
134180
135- // The preferred gender of the voice. Optional; if not set, the service will
181+ // The preferred gender of the voice. If not set, the service will
136182 // choose a voice based on the other parameters such as language_code and
137183 // name. Note that this is only a preference, not a requirement; if a
138184 // voice of the appropriate gender is not available, the synthesizer should
@@ -142,94 +188,66 @@ message VoiceSelectionParams {
142188
143189// Description of audio data to be synthesized.
144190message AudioConfig {
145- // Required. The format of the requested audio byte stream.
146- AudioEncoding audio_encoding = 1 ;
147-
148- // Optional speaking rate/speed, in the range [0.25, 4.0]. 1.0 is the normal
149- // native speed supported by the specific voice. 2.0 is twice as fast, and
150- // 0.5 is half as fast. If unset(0.0), defaults to the native 1.0 speed. Any
151- // other values < 0.25 or > 4.0 will return an error.
152- double speaking_rate = 2 ;
153-
154- // Optional speaking pitch, in the range [-20.0, 20.0]. 20 means increase 20
155- // semitones from the original pitch. -20 means decrease 20 semitones from the
156- // original pitch.
157- double pitch = 3 ;
158-
159- // Optional volume gain (in dB) of the normal native volume supported by the
160- // specific voice, in the range [-96.0, 16.0]. If unset, or set to a value of
161- // 0.0 (dB), will play at normal native signal amplitude. A value of -6.0 (dB)
162- // will play at approximately half the amplitude of the normal native signal
163- // amplitude. A value of +6.0 (dB) will play at approximately twice the
164- // amplitude of the normal native signal amplitude. Strongly recommend not to
165- // exceed +10 (dB) as there's usually no effective increase in loudness for
166- // any value greater than that.
167- double volume_gain_db = 4 ;
168-
169- // The synthesis sample rate (in hertz) for this audio. Optional. If this is
170- // different from the voice's natural sample rate, then the synthesizer will
171- // honor this request by converting to the desired sample rate (which might
172- // result in worse audio quality), unless the specified sample rate is not
173- // supported for the encoding chosen, in which case it will fail the request
174- // and return [google.rpc.Code.INVALID_ARGUMENT][].
175- int32 sample_rate_hertz = 5 ;
176-
177- // An identifier which selects 'audio effects' profiles that are applied on
178- // (post synthesized) text to speech.
179- // Effects are applied on top of each other in the order they are given.
180- // See
181- //
182- // [audio-profiles](https:
183- // //cloud.google.com/text-to-speech/docs/audio-profiles)
184- // for current supported profile ids.
185- repeated string effects_profile_id = 6 ;
191+ // Required. The format of the audio byte stream.
192+ AudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
193+
194+ // Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
195+ // the normal native speed supported by the specific voice. 2.0 is twice as
196+ // fast, and 0.5 is half as fast. If unset (0.0), defaults to the native 1.0
197+ // speed. Any other values < 0.25 or > 4.0 will return an error.
198+ double speaking_rate = 2 [
199+ (google.api.field_behavior ) = INPUT_ONLY ,
200+ (google.api.field_behavior ) = OPTIONAL
201+ ];
202+
203+ // Optional. Input only. Speaking pitch, in the range [-20.0, 20.0]. 20 means
204+ // increase 20 semitones from the original pitch. -20 means decrease 20
205+ // semitones from the original pitch.
206+ double pitch = 3 [
207+ (google.api.field_behavior ) = INPUT_ONLY ,
208+ (google.api.field_behavior ) = OPTIONAL
209+ ];
210+
211+ // Optional. Input only. Volume gain (in dB) of the normal native volume
212+ // supported by the specific voice, in the range [-96.0, 16.0]. If unset, or
213+ // set to a value of 0.0 (dB), will play at normal native signal amplitude. A
214+ // value of -6.0 (dB) will play at approximately half the amplitude of the
215+ // normal native signal amplitude. A value of +6.0 (dB) will play at
216+ // approximately twice the amplitude of the normal native signal amplitude.
217+ // Strongly recommend not to exceed +10 (dB) as there's usually no effective
218+ // increase in loudness for any value greater than that.
219+ double volume_gain_db = 4 [
220+ (google.api.field_behavior ) = INPUT_ONLY ,
221+ (google.api.field_behavior ) = OPTIONAL
222+ ];
223+
224+ // Optional. The synthesis sample rate (in hertz) for this audio. When this is
225+ // specified in SynthesizeSpeechRequest, if this is different from the voice's
226+ // natural sample rate, then the synthesizer will honor this request by
227+ // converting to the desired sample rate (which might result in worse audio
228+ // quality), unless the specified sample rate is not supported for the
229+ // encoding chosen, in which case it will fail the request and return
230+ // [google.rpc.Code.INVALID_ARGUMENT][].
231+ int32 sample_rate_hertz = 5 [(google.api.field_behavior ) = OPTIONAL ];
232+
233+ // Optional. Input only. An identifier which selects 'audio effects' profiles
234+ // that are applied on (post synthesized) text to speech. Effects are applied
235+ // on top of each other in the order they are given. See
236+ // [audio
237+ // profiles](https://cloud.google.com/text-to-speech/docs/audio-profiles) for
238+ // current supported profile ids.
239+ repeated string effects_profile_id = 6 [
240+ (google.api.field_behavior ) = INPUT_ONLY ,
241+ (google.api.field_behavior ) = OPTIONAL
242+ ];
186243}
187244
188245// The message returned to the client by the `SynthesizeSpeech` method.
189246message SynthesizeSpeechResponse {
190247 // The audio data bytes encoded as specified in the request, including the
191- // header (For LINEAR16 audio, we include the WAV header). Note: as
248+ // header for encodings that are wrapped in containers (e.g. MP3, OGG_OPUS).
249+ // For LINEAR16 audio, we include the WAV header. Note: as
192250 // with all bytes fields, protocol buffers use a pure binary representation,
193251 // whereas JSON representations use base64.
194252 bytes audio_content = 1 ;
195253}
196-
197- // Gender of the voice as described in
198- // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
199- enum SsmlVoiceGender {
200- // An unspecified gender.
201- // In VoiceSelectionParams, this means that the client doesn't care which
202- // gender the selected voice will have. In the Voice field of
203- // ListVoicesResponse, this may mean that the voice doesn't fit any of the
204- // other categories in this enum, or that the gender of the voice isn't known.
205- SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
206-
207- // A male voice.
208- MALE = 1 ;
209-
210- // A female voice.
211- FEMALE = 2 ;
212-
213- // A gender-neutral voice.
214- NEUTRAL = 3 ;
215- }
216-
217- // Configuration to set up audio encoder. The encoding determines the output
218- // audio format that we'd like.
219- enum AudioEncoding {
220- // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
221- AUDIO_ENCODING_UNSPECIFIED = 0 ;
222-
223- // Uncompressed 16-bit signed little-endian samples (Linear PCM).
224- // Audio content returned as LINEAR16 also contains a WAV header.
225- LINEAR16 = 1 ;
226-
227- // MP3 audio.
228- MP3 = 2 ;
229-
230- // Opus encoded audio wrapped in an ogg container. The result will be a
231- // file which can be played natively on Android, and in browsers (at least
232- // Chrome and Firefox). The quality of the encoding is considerably higher
233- // than MP3 while using approximately the same bitrate.
234- OGG_OPUS = 3 ;
235- }
0 commit comments