@@ -28,7 +28,6 @@ class Feature(enum.IntEnum):
LABEL_DETECTION (int): Label detection. Detect objects, such as dog or flower.
SHOT_CHANGE_DETECTION (int): Shot change detection.
EXPLICIT_CONTENT_DETECTION (int): Explicit content detection.
FACE_DETECTION (int): Human face detection and tracking.
SPEECH_TRANSCRIPTION (int): Speech transcription.
TEXT_DETECTION (int): OCR text detection and tracking.
OBJECT_TRACKING (int): Object detection and tracking.
@@ -38,7 +37,6 @@ class Feature(enum.IntEnum):
LABEL_DETECTION = 1
SHOT_CHANGE_DETECTION = 2
EXPLICIT_CONTENT_DETECTION = 3
FACE_DETECTION = 4
SPEECH_TRANSCRIPTION = 6
TEXT_DETECTION = 7
OBJECT_TRACKING = 9
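A practical note for client code: with FACE_DETECTION dropped from the v1 surface, feature lists must be built from the remaining enum values. A minimal sketch, assuming the google-cloud-videointelligence Python client (bucket and object names are placeholders):

```python
# Sketch: building a v1 feature list now that Feature.FACE_DETECTION
# (value 4) is no longer part of this enum.
from google.cloud import videointelligence

client = videointelligence.VideoIntelligenceServiceClient()

features = [
    videointelligence.enums.Feature.LABEL_DETECTION,
    videointelligence.enums.Feature.SHOT_CHANGE_DETECTION,
]

operation = client.annotate_video(
    input_uri="gs://bucket-id/object-id",  # placeholder URI
    features=features,
)
```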
@@ -18,7 +18,7 @@
},
"methods": {
"AnnotateVideo": {
"timeout_millis": 600000,
"timeout_millis": 60000,
"retry_codes_name": "idempotent",
"retry_params_name": "default",
}
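A practical consequence of the timeout change above: the default per-RPC deadline for AnnotateVideo drops from 600 s to 60 s. Callers that need the old behavior can pass an explicit deadline; a sketch reusing the client from the previous example, assuming the generated method's `timeout` argument is the per-call deadline in seconds:

```python
# Sketch: overriding the new 60 s default RPC deadline. This bounds the
# AnnotateVideo call itself, not the long-running operation it starts.
operation = client.annotate_video(
    input_uri="gs://bucket-id/object-id",
    features=[videointelligence.enums.Feature.LABEL_DETECTION],
    timeout=600.0,  # seconds; restores the previous default
)
result = operation.result(timeout=900)  # waiting on the LRO is separate
```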
@@ -1,4 +1,4 @@
// Copyright 2018 Google LLC.
// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
import "google/api/client.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1;videointelligence";
@@ -33,12 +34,14 @@ option ruby_package = "Google::Cloud::VideoIntelligence::V1";

// Service that implements Google Cloud Video Intelligence API.
service VideoIntelligenceService {
option (google.api.default_host) = "videointelligence.googleapis.com";
option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";

// Performs asynchronous video annotation. Progress and results can be
// retrieved through the `google.longrunning.Operations` interface.
// `Operation.metadata` contains `AnnotateVideoProgress` (progress).
// `Operation.response` contains `AnnotateVideoResponse` (results).
rpc AnnotateVideo(AnnotateVideoRequest)
returns (google.longrunning.Operation) {
rpc AnnotateVideo(AnnotateVideoRequest) returns (google.longrunning.Operation) {
option (google.api.http) = {
post: "/v1/videos:annotate"
body: "*"
@@ -52,10 +55,10 @@ message AnnotateVideoRequest {
// [Google Cloud Storage](https://cloud.google.com/storage/) URIs are
// supported, which must be specified in the following format:
// `gs://bucket-id/object-id` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
// more information, see [Request URIs](/storage/docs/reference-uris). A video
// URI may include wildcards in `object-id`, and thus identify multiple
// videos. Supported wildcards: '*' to match 0 or more characters;
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
// [Request URIs](/storage/docs/reference-uris).
// A video URI may include wildcards in `object-id`, and thus identify
// multiple videos. Supported wildcards: '*' to match 0 or more characters;
// '?' to match 1 character. If unset, the input video should be embedded
// in the request as `input_content`. If set, `input_content` should be unset.
string input_uri = 1;
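The wildcard rules above mean one request can fan out over many videos; for example (hypothetical object names):

```python
# Sketch: '*' matches 0 or more characters in `object-id`, '?' exactly one,
# so this single request covers every matching video in the bucket.
operation = client.annotate_video(
    input_uri="gs://bucket-id/videos/clip-*.mp4",  # hypothetical objects
    features=[videointelligence.enums.Feature.LABEL_DETECTION],
)
```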
@@ -75,8 +78,8 @@ message AnnotateVideoRequest {
// Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
// URIs are supported, which must be specified in the following format:
// `gs://bucket-id/object-id` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
// more information, see [Request URIs](/storage/docs/reference-uris).
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
// [Request URIs](/storage/docs/reference-uris).
string output_uri = 4;

// Optional cloud region where annotation should take place. Supported cloud
@@ -101,9 +104,6 @@ message VideoContext {
// Config for EXPLICIT_CONTENT_DETECTION.
ExplicitContentDetectionConfig explicit_content_detection_config = 4;

// Config for FACE_DETECTION.
FaceDetectionConfig face_detection_config = 5;

// Config for SPEECH_TRANSCRIPTION.
SpeechTranscriptionConfig speech_transcription_config = 6;

@@ -114,6 +114,66 @@ message VideoContext {
ObjectTrackingConfig object_tracking_config = 13;
}

// Video annotation feature.
enum Feature {
// Unspecified.
FEATURE_UNSPECIFIED = 0;

// Label detection. Detect objects, such as dog or flower.
LABEL_DETECTION = 1;

// Shot change detection.
SHOT_CHANGE_DETECTION = 2;

// Explicit content detection.
EXPLICIT_CONTENT_DETECTION = 3;

// Speech transcription.
SPEECH_TRANSCRIPTION = 6;

// OCR text detection and tracking.
TEXT_DETECTION = 7;

// Object detection and tracking.
OBJECT_TRACKING = 9;
}

// Label detection mode.
enum LabelDetectionMode {
// Unspecified.
LABEL_DETECTION_MODE_UNSPECIFIED = 0;

// Detect shot-level labels.
SHOT_MODE = 1;

// Detect frame-level labels.
FRAME_MODE = 2;

// Detect both shot-level and frame-level labels.
SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
// Unspecified likelihood.
LIKELIHOOD_UNSPECIFIED = 0;

// Very unlikely.
VERY_UNLIKELY = 1;

// Unlikely.
UNLIKELY = 2;

// Possible.
POSSIBLE = 3;

// Likely.
LIKELY = 4;

// Very likely.
VERY_LIKELY = 5;
}
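Likelihood is the bucketing used by explicit-content results, so consuming code compares against it; a sketch (field names from this proto; `response` assumed from an EXPLICIT_CONTENT_DETECTION request):

```python
# Sketch: filtering explicit-content frames by bucketized likelihood.
Likelihood = videointelligence.enums.Likelihood
explicit = response.annotation_results[0].explicit_annotation
for frame in explicit.frames:
    if frame.pornography_likelihood >= Likelihood.LIKELY:
        print(frame.time_offset, frame.pornography_likelihood)
```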

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
// What labels should be detected with LABEL_DETECTION, in addition to
@@ -156,28 +216,17 @@ message ShotChangeDetectionConfig {
string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
// Model to use for explicit content detection.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
// Model to use for face detection.
// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
// Model to use for object tracking.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 1;

// Whether bounding boxes should be included in the face annotation output.
bool include_bounding_boxes = 2;
}

// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
// Model to use for object tracking.
// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
// Model to use for explicit content detection.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 1;
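Taken together, these per-feature config messages hang off VideoContext; wiring them up from Python looks roughly like this (the `types` module path is assumed from the generated client):

```python
# Sketch: passing per-feature configs through VideoContext.
from google.cloud.videointelligence_v1 import types

context = types.VideoContext(
    label_detection_config=types.LabelDetectionConfig(
        label_detection_mode=videointelligence.enums.LabelDetectionMode.SHOT_AND_FRAME_MODE,
    ),
    object_tracking_config=types.ObjectTrackingConfig(model="builtin/stable"),
)
operation = client.annotate_video(
    input_uri="gs://bucket-id/object-id",
    features=[
        videointelligence.enums.Feature.LABEL_DETECTION,
        videointelligence.enums.Feature.OBJECT_TRACKING,
    ],
    video_context=context,
)
```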
@@ -295,57 +344,24 @@ message NormalizedBoundingBox {
float bottom = 4;
}

// Video segment level annotation results for face detection.
message FaceSegment {
// Video segment where a face was detected.
VideoSegment segment = 1;
}

// Video frame level annotation results for face detection.
message FaceFrame {
// Normalized bounding boxes in a frame.
// There can be more than one box if the same face is detected in multiple
// locations within the current frame.
repeated NormalizedBoundingBox normalized_bounding_boxes = 1;

// Time-offset, relative to the beginning of the video,
// corresponding to the video frame for this location.
google.protobuf.Duration time_offset = 2;
}

// Face annotation.
message FaceAnnotation {
// Thumbnail of a representative face view (in JPEG format).
bytes thumbnail = 1;

// All video segments where a face was detected.
repeated FaceSegment segments = 2;

// All video frames where a face was detected.
repeated FaceFrame frames = 3;
}

// Annotation results for a single video.
message VideoAnnotationResults {
// Video file location in
// [Google Cloud Storage](https://cloud.google.com/storage/).
string input_uri = 1;

// Label annotations on video level or user specified segment level.
// Topical label annotations on video level or user specified segment level.
// There is exactly one element for each unique label.
repeated LabelAnnotation segment_label_annotations = 2;

// Label annotations on shot level.
// Topical label annotations on shot level.
// There is exactly one element for each unique label.
repeated LabelAnnotation shot_label_annotations = 3;

// Label annotations on frame level.
// There is exactly one element for each unique label.
repeated LabelAnnotation frame_label_annotations = 4;

// Face annotations. There is exactly one element for each unique face.
repeated FaceAnnotation face_annotations = 5;

// Shot annotations. Each shot is represented as a video segment.
repeated VideoSegment shot_annotations = 6;

@@ -391,6 +407,14 @@ message VideoAnnotationProgress {

// Time of the most recent update.
google.protobuf.Timestamp update_time = 4;

// Specifies which feature is being tracked if the request contains more than
// one feature.
Feature feature = 5;

// Specifies which segment is being tracked if the request contains more than
// one segment.
VideoSegment segment = 6;
}
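With the new `feature` and `segment` fields, a poller can now tell which feature each progress entry belongs to; a sketch (accessor names assumed from the generated client):

```python
# Sketch: polling per-feature progress via Operation.metadata
# (AnnotateVideoProgress.annotation_progress has one entry per feature).
import time

while not operation.done():
    for p in operation.metadata.annotation_progress:
        print(p.feature, p.progress_percent, p.segment)
    time.sleep(10)
```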

// Video annotation progress. Included in the `metadata`
@@ -491,15 +515,17 @@ message SpeechRecognitionAlternative {
// Transcript text representing the words that the user spoke.
string transcript = 1;

// The confidence estimate between 0.0 and 1.0. A higher number
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is typically provided only for the top hypothesis, and
// only for `is_final=true` results. Clients should not rely on the
// `confidence` field as it is not guaranteed to be accurate or consistent.
// correct. This field is set only for the top alternative.
// This field is not guaranteed to be accurate and users should not rely on it
// to always be provided.
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 2;

// A list of word-specific information for each recognized word.
// Output only. A list of word-specific information for each recognized word.
// Note: When `enable_speaker_diarization` is true, you will see all the words
// from the beginning of the audio.
repeated WordInfo words = 3;
}
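Given the reworded contract (confidence set only for the top alternative, 0.0 meaning unset), consumers should treat the field as best-effort; a sketch with `response` assumed from a SPEECH_TRANSCRIPTION request:

```python
# Sketch: 0.0 is the sentinel for "confidence not set", so act only on a
# truthy value, and expect it only on the top alternative.
for transcription in response.annotation_results[0].speech_transcriptions:
    if not transcription.alternatives:
        continue
    top = transcription.alternatives[0]
    if top.confidence:  # 0.0 means unset
        print(top.transcript, top.confidence)
```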

@@ -645,66 +671,3 @@ message ObjectTrackingAnnotation {
// Streaming mode: there can be only one ObjectTrackingFrame message in frames.
repeated ObjectTrackingFrame frames = 2;
}

// Video annotation feature.
enum Feature {
// Unspecified.
FEATURE_UNSPECIFIED = 0;

// Label detection. Detect objects, such as dog or flower.
LABEL_DETECTION = 1;

// Shot change detection.
SHOT_CHANGE_DETECTION = 2;

// Explicit content detection.
EXPLICIT_CONTENT_DETECTION = 3;

// Human face detection and tracking.
FACE_DETECTION = 4;

// Speech transcription.
SPEECH_TRANSCRIPTION = 6;

// OCR text detection and tracking.
TEXT_DETECTION = 7;

// Object detection and tracking.
OBJECT_TRACKING = 9;
}

// Label detection mode.
enum LabelDetectionMode {
// Unspecified.
LABEL_DETECTION_MODE_UNSPECIFIED = 0;

// Detect shot-level labels.
SHOT_MODE = 1;

// Detect frame-level labels.
FRAME_MODE = 2;

// Detect both shot-level and frame-level labels.
SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
// Unspecified likelihood.
LIKELIHOOD_UNSPECIFIED = 0;

// Very unlikely.
VERY_UNLIKELY = 1;

// Unlikely.
UNLIKELY = 2;

// Possible.
POSSIBLE = 3;

// Likely.
LIKELY = 4;

// Very likely.
VERY_LIKELY = 5;
}
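Finally, reading batch object-tracking results out of the response, per the ObjectTrackingAnnotation hunk above (field names from the v1 proto; `response` as in the earlier sketches):

```python
# Sketch: each batch annotation is one track, with an entity, an overall
# confidence, and per-frame normalized bounding boxes.
for obj in response.annotation_results[0].object_annotations:
    print(obj.entity.description, obj.confidence)
    for frame in obj.frames:
        box = frame.normalized_bounding_box
        print(frame.time_offset, box.left, box.top, box.right, box.bottom)
```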