-// Copyright 2019 Google LLC.
+// Copyright 2018 Google LLC.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@ import "google/longrunning/operations.proto";
 import "google/protobuf/duration.proto";
 import "google/protobuf/timestamp.proto";
 import "google/rpc/status.proto";
-import "google/api/client.proto";

 option csharp_namespace = "Google.Cloud.VideoIntelligence.V1";
 option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1;videointelligence";
@@ -34,14 +33,12 @@ option ruby_package = "Google::Cloud::VideoIntelligence::V1";

 // Service that implements Google Cloud Video Intelligence API.
 service VideoIntelligenceService {
-  option (google.api.default_host) = "videointelligence.googleapis.com";
-  option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";
-
   // Performs asynchronous video annotation. Progress and results can be
   // retrieved through the `google.longrunning.Operations` interface.
   // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
   // `Operation.response` contains `AnnotateVideoResponse` (results).
-  rpc AnnotateVideo(AnnotateVideoRequest) returns (google.longrunning.Operation) {
+  rpc AnnotateVideo(AnnotateVideoRequest)
+      returns (google.longrunning.Operation) {
     option (google.api.http) = {
       post: "/v1/videos:annotate"
       body: "*"
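
For orientation, a minimal sketch of driving this long-running RPC from the published Python client. The package, class, and enum names below come from google-cloud-videointelligence and are an assumption about the generated surface, not part of this proto file:

from google.cloud import videointelligence

client = videointelligence.VideoIntelligenceServiceClient()

# AnnotateVideo returns a google.longrunning.Operation; the client wraps it
# so that .result() polls until the AnnotateVideoResponse is available.
operation = client.annotate_video(
    request={
        "input_uri": "gs://bucket-id/object-id",  # hypothetical input video
        "features": [videointelligence.Feature.LABEL_DETECTION],
    }
)
response = operation.result(timeout=300)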
@@ -55,10 +52,10 @@ message AnnotateVideoRequest {
   // [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
   // supported, which must be specified in the following format:
   // `gs://bucket-id/object-id` (other URI formats return
-  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
-  // [Request URIs](/storage/docs/reference-uris).
-  // A video URI may include wildcards in `object-id`, and thus identify
-  // multiple videos. Supported wildcards: '*' to match 0 or more characters;
+  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
+  // more information, see [Request URIs](/storage/docs/reference-uris). A video
+  // URI may include wildcards in `object-id`, and thus identify multiple
+  // videos. Supported wildcards: '*' to match 0 or more characters;
   // '?' to match 1 character. If unset, the input video should be embedded
   // in the request as `input_content`. If set, `input_content` should be unset.
   string input_uri = 1;
@@ -78,8 +75,8 @@ message AnnotateVideoRequest {
   // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
   // URIs are supported, which must be specified in the following format:
   // `gs://bucket-id/object-id` (other URI formats return
-  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
-  // [Request URIs](/storage/docs/reference-uris).
+  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
+  // more information, see [Request URIs](/storage/docs/reference-uris).
   string output_uri = 4;

   // Optional cloud region where annotation should take place. Supported cloud
@@ -104,6 +101,9 @@ message VideoContext {
   // Config for EXPLICIT_CONTENT_DETECTION.
   ExplicitContentDetectionConfig explicit_content_detection_config = 4;

+  // Config for FACE_DETECTION.
+  FaceDetectionConfig face_detection_config = 5;
+
   // Config for SPEECH_TRANSCRIPTION.
   SpeechTranscriptionConfig speech_transcription_config = 6;

@@ -114,66 +114,6 @@ message VideoContext {
   ObjectTrackingConfig object_tracking_config = 13;
 }

-// Video annotation feature.
-enum Feature {
-  // Unspecified.
-  FEATURE_UNSPECIFIED = 0;
-
-  // Label detection. Detect objects, such as dog or flower.
-  LABEL_DETECTION = 1;
-
-  // Shot change detection.
-  SHOT_CHANGE_DETECTION = 2;
-
-  // Explicit content detection.
-  EXPLICIT_CONTENT_DETECTION = 3;
-
-  // Speech transcription.
-  SPEECH_TRANSCRIPTION = 6;
-
-  // OCR text detection and tracking.
-  TEXT_DETECTION = 7;
-
-  // Object detection and tracking.
-  OBJECT_TRACKING = 9;
-}
-
-// Label detection mode.
-enum LabelDetectionMode {
-  // Unspecified.
-  LABEL_DETECTION_MODE_UNSPECIFIED = 0;
-
-  // Detect shot-level labels.
-  SHOT_MODE = 1;
-
-  // Detect frame-level labels.
-  FRAME_MODE = 2;
-
-  // Detect both shot-level and frame-level labels.
-  SHOT_AND_FRAME_MODE = 3;
-}
-
-// Bucketized representation of likelihood.
-enum Likelihood {
-  // Unspecified likelihood.
-  LIKELIHOOD_UNSPECIFIED = 0;
-
-  // Very unlikely.
-  VERY_UNLIKELY = 1;
-
-  // Unlikely.
-  UNLIKELY = 2;
-
-  // Possible.
-  POSSIBLE = 3;
-
-  // Likely.
-  LIKELY = 4;
-
-  // Very likely.
-  VERY_LIKELY = 5;
-}
-
 // Config for LABEL_DETECTION.
 message LabelDetectionConfig {
   // What labels should be detected with LABEL_DETECTION, in addition to
@@ -216,17 +156,28 @@ message ShotChangeDetectionConfig {
   string model = 1;
 }

-// Config for OBJECT_TRACKING.
-message ObjectTrackingConfig {
-  // Model to use for object tracking.
+// Config for EXPLICIT_CONTENT_DETECTION.
+message ExplicitContentDetectionConfig {
+  // Model to use for explicit content detection.
   // Supported values: "builtin/stable" (the default if unset) and
   // "builtin/latest".
   string model = 1;
 }

-// Config for EXPLICIT_CONTENT_DETECTION.
-message ExplicitContentDetectionConfig {
-  // Model to use for explicit content detection.
+// Config for FACE_DETECTION.
+message FaceDetectionConfig {
+  // Model to use for face detection.
+  // Supported values: "builtin/stable" (the default if unset) and
+  // "builtin/latest".
+  string model = 1;
+
+  // Whether bounding boxes are included in the face annotation output.
+  bool include_bounding_boxes = 2;
+}
+
+// Config for OBJECT_TRACKING.
+message ObjectTrackingConfig {
+  // Model to use for object tracking.
   // Supported values: "builtin/stable" (the default if unset) and
   // "builtin/latest".
   string model = 1;
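
The re-added FaceDetectionConfig hangs off VideoContext (field 5 above). A hedged sketch of enabling it, again assuming the Python client's generated names rather than anything stated in this file:

from google.cloud import videointelligence

client = videointelligence.VideoIntelligenceServiceClient()

# include_bounding_boxes asks the service to emit per-frame boxes in the
# FaceFrame messages defined later in this file.
context = videointelligence.VideoContext(
    face_detection_config=videointelligence.FaceDetectionConfig(
        model="builtin/stable",  # the documented default when unset
        include_bounding_boxes=True,
    )
)
operation = client.annotate_video(
    request={
        "input_uri": "gs://bucket-id/object-id",  # hypothetical input video
        "features": [videointelligence.Feature.FACE_DETECTION],
        "video_context": context,
    }
)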
@@ -344,24 +295,57 @@ message NormalizedBoundingBox {
   float bottom = 4;
 }

+// Video segment level annotation results for face detection.
+message FaceSegment {
+  // Video segment where a face was detected.
+  VideoSegment segment = 1;
+}
+
+// Video frame level annotation results for face detection.
+message FaceFrame {
+  // Normalized bounding boxes in a frame.
+  // There can be more than one box if the same face is detected in multiple
+  // locations within the current frame.
+  repeated NormalizedBoundingBox normalized_bounding_boxes = 1;
+
+  // Time-offset, relative to the beginning of the video,
+  // corresponding to the video frame for this location.
+  google.protobuf.Duration time_offset = 2;
+}
+
+// Face annotation.
+message FaceAnnotation {
+  // Thumbnail of a representative face view (in JPEG format).
+  bytes thumbnail = 1;
+
+  // All video segments where a face was detected.
+  repeated FaceSegment segments = 2;
+
+  // All video frames where a face was detected.
+  repeated FaceFrame frames = 3;
+}
+
 // Annotation results for a single video.
 message VideoAnnotationResults {
   // Video file location in
   // [Google Cloud Storage](https://cloud.google.com/storage/).
   string input_uri = 1;

-  // Topical label annotations on video level or user specified segment level.
+  // Label annotations on video level or user specified segment level.
   // There is exactly one element for each unique label.
   repeated LabelAnnotation segment_label_annotations = 2;

-  // Topical label annotations on shot level.
+  // Label annotations on shot level.
   // There is exactly one element for each unique label.
   repeated LabelAnnotation shot_label_annotations = 3;

   // Label annotations on frame level.
   // There is exactly one element for each unique label.
   repeated LabelAnnotation frame_label_annotations = 4;

+  // Face annotations. There is exactly one element for each unique face.
+  repeated FaceAnnotation face_annotations = 5;
+
   // Shot annotations. Each shot is represented as a video segment.
   repeated VideoSegment shot_annotations = 6;

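
Continuing the sketch above, reading the restored face results out of VideoAnnotationResults might look like this; the field access mirrors the messages in this hunk, while the client object remains an assumption:

# One VideoAnnotationResults per input video.
result = operation.result(timeout=300).annotation_results[0]

for face in result.face_annotations:
    for segment in face.segments:
        print(segment.segment.start_time_offset, segment.segment.end_time_offset)
    for frame in face.frames:
        # A frame can carry several boxes when the same face is detected
        # in multiple locations within that frame.
        for box in frame.normalized_bounding_boxes:
            print(frame.time_offset, box.left, box.top, box.right, box.bottom)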
@@ -407,14 +391,6 @@ message VideoAnnotationProgress {

   // Time of the most recent update.
   google.protobuf.Timestamp update_time = 4;
-
-  // Specifies which feature is being tracked if the request contains more than
-  // one features.
-  Feature feature = 5;
-
-  // Specifies which segment is being tracked if the request contains more than
-  // one segments.
-  VideoSegment segment = 6;
 }

 // Video annotation progress. Included in the `metadata`
@@ -515,17 +491,15 @@ message SpeechRecognitionAlternative {
   // Transcript text representing the words that the user spoke.
   string transcript = 1;

-  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
+  // The confidence estimate between 0.0 and 1.0. A higher number
   // indicates an estimated greater likelihood that the recognized words are
-  // correct. This field is set only for the top alternative.
-  // This field is not guaranteed to be accurate and users should not rely on it
-  // to be always provided.
+  // correct. This field is typically provided only for the top hypothesis, and
+  // only for `is_final=true` results. Clients should not rely on the
+  // `confidence` field as it is not guaranteed to be accurate or consistent.
   // The default of 0.0 is a sentinel value indicating `confidence` was not set.
   float confidence = 2;

-  // Output only. A list of word-specific information for each recognized word.
-  // Note: When `enable_speaker_diarization` is true, you will see all the words
-  // from the beginning of the audio.
+  // A list of word-specific information for each recognized word.
   repeated WordInfo words = 3;
 }

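
Given the sentinel semantics documented above, a client should treat confidence as advisory. A small sketch, reusing the result object from the earlier snippets:

for transcription in result.speech_transcriptions:
    for alternative in transcription.alternatives:
        print(alternative.transcript)
        # 0.0 is the documented "not set" sentinel, so only surface the
        # value when the service actually populated it.
        if alternative.confidence > 0.0:
            print(f"confidence: {alternative.confidence:.2f}")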
@@ -671,3 +645,66 @@ message ObjectTrackingAnnotation {
   // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
   repeated ObjectTrackingFrame frames = 2;
 }
+
+// Video annotation feature.
+enum Feature {
+  // Unspecified.
+  FEATURE_UNSPECIFIED = 0;
+
+  // Label detection. Detect objects, such as dog or flower.
+  LABEL_DETECTION = 1;
+
+  // Shot change detection.
+  SHOT_CHANGE_DETECTION = 2;
+
+  // Explicit content detection.
+  EXPLICIT_CONTENT_DETECTION = 3;
+
+  // Human face detection and tracking.
+  FACE_DETECTION = 4;
+
+  // Speech transcription.
+  SPEECH_TRANSCRIPTION = 6;
+
+  // OCR text detection and tracking.
+  TEXT_DETECTION = 7;
+
+  // Object detection and tracking.
+  OBJECT_TRACKING = 9;
+}
+
+// Label detection mode.
+enum LabelDetectionMode {
+  // Unspecified.
+  LABEL_DETECTION_MODE_UNSPECIFIED = 0;
+
+  // Detect shot-level labels.
+  SHOT_MODE = 1;
+
+  // Detect frame-level labels.
+  FRAME_MODE = 2;
+
+  // Detect both shot-level and frame-level labels.
+  SHOT_AND_FRAME_MODE = 3;
+}
+
+// Bucketized representation of likelihood.
+enum Likelihood {
+  // Unspecified likelihood.
+  LIKELIHOOD_UNSPECIFIED = 0;
+
+  // Very unlikely.
+  VERY_UNLIKELY = 1;
+
+  // Unlikely.
+  UNLIKELY = 2;
+
+  // Possible.
+  POSSIBLE = 3;
+
+  // Likely.
+  LIKELY = 4;
+
+  // Very likely.
+  VERY_LIKELY = 5;
+}