diff --git a/sdk/cognitiveservices/azure-ai-transcription/CHANGELOG.md b/sdk/cognitiveservices/azure-ai-transcription/CHANGELOG.md index 850550c553c8..8cfa85875e9e 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/CHANGELOG.md +++ b/sdk/cognitiveservices/azure-ai-transcription/CHANGELOG.md @@ -1,5 +1,15 @@ # Release History +## 1.0.0b3 (2026-02-04) + +### Features Added + +- Enhanced Mode now automatically sets `enabled=True` when `task`, `target_language`, or `prompt` are specified + +### Bugs Fixed + +- Fixed Enhanced Mode not being activated when using `EnhancedModeProperties` without explicitly setting `enabled=True` + ## 1.0.0b2 (2025-12-19) ### Bugs Fixed diff --git a/sdk/cognitiveservices/azure-ai-transcription/README.md b/sdk/cognitiveservices/azure-ai-transcription/README.md index 4a96edf3c0fd..18e31f3c2d97 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/README.md +++ b/sdk/cognitiveservices/azure-ai-transcription/README.md @@ -149,10 +149,18 @@ from azure.ai.transcription.models import TranscriptionContent, TranscriptionOpt # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] -api_key = os.environ["AZURE_SPEECH_API_KEY"] + +# We recommend using role-based access control (RBAC) for production scenarios +api_key = os.environ.get("AZURE_SPEECH_API_KEY") +if api_key: + credential = AzureKeyCredential(api_key) +else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client -client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) +client = TranscriptionClient(endpoint=endpoint, credential=credential) # Path to your audio file import pathlib @@ -197,10 +205,18 @@ from azure.ai.transcription.models import TranscriptionOptions # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] -api_key = os.environ["AZURE_SPEECH_API_KEY"] + +# We recommend using role-based access 
control (RBAC) for production scenarios +api_key = os.environ.get("AZURE_SPEECH_API_KEY") +if api_key: + credential = AzureKeyCredential(api_key) +else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client -client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) +client = TranscriptionClient(endpoint=endpoint, credential=credential) # URL to your audio file (must be publicly accessible) audio_url = "https://example.com/path/to/audio.wav" @@ -238,31 +254,29 @@ from azure.ai.transcription.models import ( # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] -api_key = os.environ["AZURE_SPEECH_API_KEY"] + +# We recommend using role-based access control (RBAC) for production scenarios +api_key = os.environ.get("AZURE_SPEECH_API_KEY") +if api_key: + credential = AzureKeyCredential(api_key) +else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client -client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) +client = TranscriptionClient(endpoint=endpoint, credential=credential) # Path to your audio file -import pathlib - audio_file_path = pathlib.Path(__file__).parent / "assets" / "audio.wav" # Open and read the audio file with open(audio_file_path, "rb") as audio_file: - # Create enhanced mode properties - # Enable enhanced mode for advanced processing capabilities - enhanced_mode = EnhancedModeProperties( - task="translation", # Specify the task type (e.g., "translation", "summarization") - target_language="es-ES", # Target language for translation - prompt=[ - "Translate the following audio to Spanish", - "Focus on technical terminology", - ], # Optional prompts to guide the enhanced mode - ) + # Enhanced mode is automatically enabled when task is specified + enhanced_mode = EnhancedModeProperties(task="transcribe") # 
Create transcription options with enhanced mode - options = TranscriptionOptions(locales=["en-US"], enhanced_mode=enhanced_mode) + options = TranscriptionOptions(enhanced_mode=enhanced_mode) # Create the request content request_content = TranscriptionContent(definition=options, audio=audio_file) @@ -271,14 +285,7 @@ with open(audio_file_path, "rb") as audio_file: result = client.transcribe(request_content) # Print the transcription result - print("Transcription with enhanced mode:") - print(f"{result.combined_phrases[0].text}") - - # Print individual phrases if available - if result.phrases: - print("\nDetailed phrases:") - for phrase in result.phrases: - print(f" [{phrase.offset_milliseconds}ms]: {phrase.text}") + print(result.combined_phrases[0].text) ``` @@ -296,10 +303,18 @@ from azure.ai.transcription.models import TranscriptionContent, TranscriptionOpt # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] -api_key = os.environ["AZURE_SPEECH_API_KEY"] + +# We recommend using role-based access control (RBAC) for production scenarios +api_key = os.environ.get("AZURE_SPEECH_API_KEY") +if api_key: + credential = AzureKeyCredential(api_key) +else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client -async with TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as client: +async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: # Path to your audio file import pathlib diff --git a/sdk/cognitiveservices/azure-ai-transcription/_metadata.json b/sdk/cognitiveservices/azure-ai-transcription/_metadata.json index 730a12350326..d8f3922c3623 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/_metadata.json +++ b/sdk/cognitiveservices/azure-ai-transcription/_metadata.json @@ -1,5 +1,3 @@ { - "apiVersion": "2025-10-15", - "service_name": "Cognitive Services", - "msDocService": "cognitive-services" 
+ "apiVersion": "2025-10-15" } \ No newline at end of file diff --git a/sdk/cognitiveservices/azure-ai-transcription/assets.json b/sdk/cognitiveservices/azure-ai-transcription/assets.json index 3054fe8cf8d1..83197718b3ae 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/assets.json +++ b/sdk/cognitiveservices/azure-ai-transcription/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/cognitiveservices/azure-ai-transcription", - "Tag": "python/cognitiveservices/azure-ai-transcription_5f9f60e291" + "Tag": "python/cognitiveservices/azure-ai-transcription_807296d8e0" } diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_client.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_client.py index 0db2bd87f0e2..dd0f743b3760 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_client.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_client.py @@ -27,7 +27,7 @@ class TranscriptionClient(_TranscriptionClientOperationsMixin): """TranscriptionClient. :param endpoint: Supported Cognitive Services endpoints (protocol and hostname, for example: - `https://westus.api.cognitive.microsoft.com `_. + `https://westus.api.cognitive.microsoft.com `_). Required. :type endpoint: str :param credential: Credential used to authenticate requests to the service. Is either a key diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_configuration.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_configuration.py index ac72b15700dd..7d2baf2d3db3 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_configuration.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_configuration.py @@ -24,7 +24,7 @@ class TranscriptionClientConfiguration: # pylint: disable=too-many-instance-att attributes. 
:param endpoint: Supported Cognitive Services endpoints (protocol and hostname, for example: - `https://westus.api.cognitive.microsoft.com `_. + `https://westus.api.cognitive.microsoft.com `_). Required. :type endpoint: str :param credential: Credential used to authenticate requests to the service. Is either a key diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_operations/_operations.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_operations/_operations.py index 7baa39bdacf9..cd70f3c52e6c 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_operations/_operations.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_operations/_operations.py @@ -110,12 +110,11 @@ def transcribe(self, body: Union[_models.TranscriptionContent, JSON], **kwargs: _body = body.as_dict() if isinstance(body, _Model) else body _file_fields: list[str] = ["audio"] _data_fields: list[str] = ["definition"] - _files, _data = prepare_multipart_form_data(_body, _file_fields, _data_fields) + _files = prepare_multipart_form_data(_body, _file_fields, _data_fields) _request = build_transcription_transcribe_request( api_version=self._config.api_version, files=_files, - data=_data, headers=_headers, params=_params, ) diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_operations/_patch.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_operations/_patch.py index 263038ad743f..af6dc07b7584 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_operations/_patch.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_operations/_patch.py @@ -12,7 +12,14 @@ from typing import Any, Optional import json from azure.core.tracing.decorator import distributed_trace -from azure.core.exceptions import map_error, HttpResponseError, ClientAuthenticationError, ResourceNotFoundError, ResourceExistsError, 
ResourceNotModifiedError +from azure.core.exceptions import ( + map_error, + HttpResponseError, + ClientAuthenticationError, + ResourceNotFoundError, + ResourceExistsError, + ResourceNotModifiedError, +) from .. import models as _models from .._utils.model_base import _deserialize, SdkJSONEncoder @@ -93,7 +100,9 @@ def transcribe_from_url( } _request.url = self._client.format_url(_request.url, **path_format_arguments) - pipeline_response = self._client._pipeline.run(_request, stream=False, **kwargs) # pylint: disable=protected-access + pipeline_response = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=False, **kwargs + ) response = pipeline_response.http_response if response.status_code not in [200]: diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/model_base.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/model_base.py index 12926fa98dcf..c402af2afc63 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/model_base.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/model_base.py @@ -37,6 +37,7 @@ TZ_UTC = timezone.utc _T = typing.TypeVar("_T") +_NONE_TYPE = type(None) def _timedelta_as_isostr(td: timedelta) -> str: @@ -171,6 +172,21 @@ def default(self, o): # pylint: disable=too-many-return-statements r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{4}\s\d{2}:\d{2}:\d{2}\sGMT" ) +_ARRAY_ENCODE_MAPPING = { + "pipeDelimited": "|", + "spaceDelimited": " ", + "commaDelimited": ",", + "newlineDelimited": "\n", +} + + +def _deserialize_array_encoded(delimit: str, attr): + if isinstance(attr, str): + if attr == "": + return [] + return attr.split(delimit) + return attr + def _deserialize_datetime(attr: typing.Union[str, datetime]) -> datetime: """Deserialize ISO-8601 formatted string into Datetime object. 
@@ -202,7 +218,7 @@ def _deserialize_datetime(attr: typing.Union[str, datetime]) -> datetime: test_utc = date_obj.utctimetuple() if test_utc.tm_year > 9999 or test_utc.tm_year < 1: raise OverflowError("Hit max or min date") - return date_obj + return date_obj # type: ignore[no-any-return] def _deserialize_datetime_rfc7231(attr: typing.Union[str, datetime]) -> datetime: @@ -256,7 +272,7 @@ def _deserialize_time(attr: typing.Union[str, time]) -> time: """ if isinstance(attr, time): return attr - return isodate.parse_time(attr) + return isodate.parse_time(attr) # type: ignore[no-any-return] def _deserialize_bytes(attr): @@ -315,6 +331,8 @@ def _deserialize_int_as_str(attr): def get_deserializer(annotation: typing.Any, rf: typing.Optional["_RestField"] = None): if annotation is int and rf and rf._format == "str": return _deserialize_int_as_str + if annotation is str and rf and rf._format in _ARRAY_ENCODE_MAPPING: + return functools.partial(_deserialize_array_encoded, _ARRAY_ENCODE_MAPPING[rf._format]) if rf and rf._format: return _DESERIALIZE_MAPPING_WITHFORMAT.get(rf._format) return _DESERIALIZE_MAPPING.get(annotation) # pyright: ignore @@ -353,9 +371,39 @@ def __contains__(self, key: typing.Any) -> bool: return key in self._data def __getitem__(self, key: str) -> typing.Any: + # If this key has been deserialized (for mutable types), we need to handle serialization + if hasattr(self, "_attr_to_rest_field"): + cache_attr = f"_deserialized_{key}" + if hasattr(self, cache_attr): + rf = _get_rest_field(getattr(self, "_attr_to_rest_field"), key) + if rf: + value = self._data.get(key) + if isinstance(value, (dict, list, set)): + # For mutable types, serialize and return + # But also update _data with serialized form and clear flag + # so mutations via this returned value affect _data + serialized = _serialize(value, rf._format) + # If serialized form is same type (no transformation needed), + # return _data directly so mutations work + if isinstance(serialized, type(value)) 
and serialized == value: + return self._data.get(key) + # Otherwise return serialized copy and clear flag + try: + object.__delattr__(self, cache_attr) + except AttributeError: + pass + # Store serialized form back + self._data[key] = serialized + return serialized return self._data.__getitem__(key) def __setitem__(self, key: str, value: typing.Any) -> None: + # Clear any cached deserialized value when setting through dictionary access + cache_attr = f"_deserialized_{key}" + try: + object.__delattr__(self, cache_attr) + except AttributeError: + pass self._data.__setitem__(key, value) def __delitem__(self, key: str) -> None: @@ -483,6 +531,8 @@ def _is_model(obj: typing.Any) -> bool: def _serialize(o, format: typing.Optional[str] = None): # pylint: disable=too-many-return-statements if isinstance(o, list): + if format in _ARRAY_ENCODE_MAPPING and all(isinstance(x, str) for x in o): + return _ARRAY_ENCODE_MAPPING[format].join(o) return [_serialize(x, format) for x in o] if isinstance(o, dict): return {k: _serialize(v, format) for k, v in o.items()} @@ -758,6 +808,14 @@ def _deserialize_multiple_sequence( return type(obj)(_deserialize(deserializer, entry, module) for entry, deserializer in zip(obj, entry_deserializers)) +def _is_array_encoded_deserializer(deserializer: functools.partial) -> bool: + return ( + isinstance(deserializer, functools.partial) + and isinstance(deserializer.args[0], functools.partial) + and deserializer.args[0].func == _deserialize_array_encoded # pylint: disable=comparison-with-callable + ) + + def _deserialize_sequence( deserializer: typing.Optional[typing.Callable], module: typing.Optional[str], @@ -767,6 +825,19 @@ def _deserialize_sequence( return obj if isinstance(obj, ET.Element): obj = list(obj) + + # encoded string may be deserialized to sequence + if isinstance(obj, str) and isinstance(deserializer, functools.partial): + # for list[str] + if _is_array_encoded_deserializer(deserializer): + return deserializer(obj) + + # for 
list[Union[...]] + if isinstance(deserializer.args[0], list): + for sub_deserializer in deserializer.args[0]: + if _is_array_encoded_deserializer(sub_deserializer): + return sub_deserializer(obj) + return type(obj)(_deserialize(deserializer, entry, module) for entry in obj) @@ -817,16 +888,16 @@ def _get_deserialize_callable_from_annotation( # pylint: disable=too-many-retur # is it optional? try: - if any(a for a in annotation.__args__ if a == type(None)): # pyright: ignore + if any(a is _NONE_TYPE for a in annotation.__args__): # pyright: ignore if len(annotation.__args__) <= 2: # pyright: ignore if_obj_deserializer = _get_deserialize_callable_from_annotation( - next(a for a in annotation.__args__ if a != type(None)), module, rf # pyright: ignore + next(a for a in annotation.__args__ if a is not _NONE_TYPE), module, rf # pyright: ignore ) return functools.partial(_deserialize_with_optional, if_obj_deserializer) # the type is Optional[Union[...]], we need to remove the None type from the Union annotation_copy = copy.copy(annotation) - annotation_copy.__args__ = [a for a in annotation_copy.__args__ if a != type(None)] # pyright: ignore + annotation_copy.__args__ = [a for a in annotation_copy.__args__ if a is not _NONE_TYPE] # pyright: ignore return _get_deserialize_callable_from_annotation(annotation_copy, module, rf) except AttributeError: pass @@ -952,7 +1023,7 @@ def _failsafe_deserialize( ) -> typing.Any: try: return _deserialize(deserializer, response.json(), module, rf, format) - except DeserializationError: + except Exception: # pylint: disable=broad-except _LOGGER.warning( "Ran into a deserialization error. Ignoring since this is failsafe deserialization", exc_info=True ) @@ -965,7 +1036,7 @@ def _failsafe_deserialize_xml( ) -> typing.Any: try: return _deserialize_xml(deserializer, response.text()) - except DeserializationError: + except Exception: # pylint: disable=broad-except _LOGGER.warning( "Ran into a deserialization error. 
Ignoring since this is failsafe deserialization", exc_info=True ) @@ -998,7 +1069,11 @@ def __init__( @property def _class_type(self) -> typing.Any: - return getattr(self._type, "args", [None])[0] + result = getattr(self._type, "args", [None])[0] + # type may be wrapped by nested functools.partial so we need to check for that + if isinstance(result, functools.partial): + return getattr(result, "args", [None])[0] + return result @property def _rest_name(self) -> str: @@ -1009,14 +1084,37 @@ def _rest_name(self) -> str: def __get__(self, obj: Model, type=None): # pylint: disable=redefined-builtin # by this point, type and rest_name will have a value bc we default # them in __new__ of the Model class - item = obj.get(self._rest_name) + # Use _data.get() directly to avoid triggering __getitem__ which clears the cache + item = obj._data.get(self._rest_name) if item is None: return item if self._is_model: return item - return _deserialize(self._type, _serialize(item, self._format), rf=self) + + # For mutable types, we want mutations to directly affect _data + # Check if we've already deserialized this value + cache_attr = f"_deserialized_{self._rest_name}" + if hasattr(obj, cache_attr): + # Return the value from _data directly (it's been deserialized in place) + return obj._data.get(self._rest_name) + + deserialized = _deserialize(self._type, _serialize(item, self._format), rf=self) + + # For mutable types, store the deserialized value back in _data + # so mutations directly affect _data + if isinstance(deserialized, (dict, list, set)): + obj._data[self._rest_name] = deserialized + object.__setattr__(obj, cache_attr, True) # Mark as deserialized + return deserialized + + return deserialized def __set__(self, obj: Model, value) -> None: + # Clear the cached deserialized object when setting a new value + cache_attr = f"_deserialized_{self._rest_name}" + if hasattr(obj, cache_attr): + object.__delattr__(obj, cache_attr) + if value is None: # we want to wipe out entries if 
users set attr to None try: @@ -1184,7 +1282,7 @@ def _get_wrapped_element( _get_element(v, exclude_readonly, meta, wrapped_element) else: wrapped_element.text = _get_primitive_type_value(v) - return wrapped_element + return wrapped_element # type: ignore[no-any-return] def _get_primitive_type_value(v) -> str: @@ -1197,7 +1295,9 @@ def _get_primitive_type_value(v) -> str: return str(v) -def _create_xml_element(tag, prefix=None, ns=None): +def _create_xml_element( + tag: typing.Any, prefix: typing.Optional[str] = None, ns: typing.Optional[str] = None +) -> ET.Element: if prefix and ns: ET.register_namespace(prefix, ns) if ns: diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/serialization.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/serialization.py index 45a3e44e45cb..81ec1de5922b 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/serialization.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/serialization.py @@ -821,13 +821,20 @@ def serialize_basic(cls, data, data_type, **kwargs): :param str data_type: Type of object in the iterable. :rtype: str, int, float, bool :return: serialized object + :raises TypeError: raise if data_type is not one of str, int, float, bool. """ custom_serializer = cls._get_custom_serializers(data_type, **kwargs) if custom_serializer: return custom_serializer(data) if data_type == "str": return cls.serialize_unicode(data) - return eval(data_type)(data) # nosec # pylint: disable=eval-used + if data_type == "int": + return int(data) + if data_type == "float": + return float(data) + if data_type == "bool": + return bool(data) + raise TypeError("Unknown basic data type: {}".format(data_type)) @classmethod def serialize_unicode(cls, data): @@ -1757,7 +1764,7 @@ def deserialize_basic(self, attr, data_type): # pylint: disable=too-many-return :param str data_type: deserialization data type. 
:return: Deserialized basic type. :rtype: str, int, float or bool - :raises TypeError: if string format is not valid. + :raises TypeError: if string format is not valid or data_type is not one of str, int, float, bool. """ # If we're here, data is supposed to be a basic type. # If it's still an XML node, take the text @@ -1783,7 +1790,11 @@ def deserialize_basic(self, attr, data_type): # pylint: disable=too-many-return if data_type == "str": return self.deserialize_unicode(attr) - return eval(data_type)(attr) # nosec # pylint: disable=eval-used + if data_type == "int": + return int(attr) + if data_type == "float": + return float(attr) + raise TypeError("Unknown basic data type: {}".format(data_type)) @staticmethod def deserialize_unicode(data): diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/utils.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/utils.py index 4c029bd3dfa5..54fdb84a9191 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/utils.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_utils/utils.py @@ -49,9 +49,8 @@ def serialize_multipart_data_entry(data_entry: Any) -> Any: def prepare_multipart_form_data( body: Mapping[str, Any], multipart_fields: list[str], data_fields: list[str] -) -> tuple[list[FileType], dict[str, Any]]: +) -> list[FileType]: files: list[FileType] = [] - data: dict[str, Any] = {} for multipart_field in multipart_fields: multipart_entry = body.get(multipart_field) if isinstance(multipart_entry, list): @@ -59,9 +58,11 @@ def prepare_multipart_form_data( elif multipart_entry: files.append((multipart_field, multipart_entry)) + # if files is empty, sdk core library can't handle multipart/form-data correctly, so + # we put data fields into files with filename as None to avoid that scenario. 
for data_field in data_fields: data_entry = body.get(data_field) if data_entry: - data[data_field] = serialize_multipart_data_entry(data_entry) + files.append((data_field, str(serialize_multipart_data_entry(data_entry)))) - return files, data + return files diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_version.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_version.py index bbcd28b4aa67..c43fdbc2e239 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_version.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/_version.py @@ -6,4 +6,4 @@ # Changes may cause incorrect behavior and will be lost if the code is regenerated. # -------------------------------------------------------------------------- -VERSION = "1.0.0b2" +VERSION = "1.0.0b3" diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_client.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_client.py index ea92bf364d24..3c0b1e71b730 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_client.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_client.py @@ -27,7 +27,7 @@ class TranscriptionClient(_TranscriptionClientOperationsMixin): """TranscriptionClient. :param endpoint: Supported Cognitive Services endpoints (protocol and hostname, for example: - `https://westus.api.cognitive.microsoft.com `_. + `https://westus.api.cognitive.microsoft.com `_). Required. :type endpoint: str :param credential: Credential used to authenticate requests to the service. 
Is either a key diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_configuration.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_configuration.py index cf21f43beb9c..3d430e678ca3 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_configuration.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_configuration.py @@ -24,7 +24,7 @@ class TranscriptionClientConfiguration: # pylint: disable=too-many-instance-att attributes. :param endpoint: Supported Cognitive Services endpoints (protocol and hostname, for example: - `https://westus.api.cognitive.microsoft.com `_. + `https://westus.api.cognitive.microsoft.com `_). Required. :type endpoint: str :param credential: Credential used to authenticate requests to the service. Is either a key diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_operations/_operations.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_operations/_operations.py index 0b1835a4223e..20a77b8ae4bd 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_operations/_operations.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_operations/_operations.py @@ -90,12 +90,11 @@ async def transcribe( _body = body.as_dict() if isinstance(body, _Model) else body _file_fields: list[str] = ["audio"] _data_fields: list[str] = ["definition"] - _files, _data = prepare_multipart_form_data(_body, _file_fields, _data_fields) + _files = prepare_multipart_form_data(_body, _file_fields, _data_fields) _request = build_transcription_transcribe_request( api_version=self._config.api_version, files=_files, - data=_data, headers=_headers, params=_params, ) diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_operations/_patch.py 
b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_operations/_patch.py index 8641af408ea6..53b64e11014f 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_operations/_patch.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/aio/_operations/_patch.py @@ -12,7 +12,14 @@ from typing import Any, Optional import json from azure.core.tracing.decorator_async import distributed_trace_async -from azure.core.exceptions import map_error, HttpResponseError, ClientAuthenticationError, ResourceNotFoundError, ResourceExistsError, ResourceNotModifiedError +from azure.core.exceptions import ( + map_error, + HttpResponseError, + ClientAuthenticationError, + ResourceNotFoundError, + ResourceExistsError, + ResourceNotModifiedError, +) from ... import models as _models from ..._utils.model_base import _deserialize, SdkJSONEncoder @@ -91,7 +98,9 @@ async def transcribe_from_url( } _request.url = self._client.format_url(_request.url, **path_format_arguments) - pipeline_response = await self._client._pipeline.run(_request, stream=False, **kwargs) # pylint: disable=protected-access + pipeline_response = await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=False, **kwargs + ) response = pipeline_response.http_response if response.status_code not in [200]: diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_enums.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_enums.py index c6fa0da4254d..e341385368ac 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_enums.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_enums.py @@ -18,6 +18,6 @@ class ProfanityFilterMode(str, Enum, metaclass=CaseInsensitiveEnumMeta): REMOVED = "Removed" """Remove profanity.""" TAGS = "Tags" - """Add "profanity" XML tags""" + """Add "profanity" XML tags.""" 
MASKED = "Masked" -"""Mask the profanity with * except of the first letter, e.g., f***""" +"""Mask the profanity with * except for the first letter, e.g., f***.""" diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_models.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_models.py index ba3dd5af6cd6..0ca8d9922af9 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_models.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_models.py @@ -100,7 +100,7 @@ class PhraseListProperties(_Model): :ivar phrases: List of phrases for recognition. :vartype phrases: list[str] - :ivar biasing_weight: Biasing weight for phrase list (1.0 to 20.0). + :ivar biasing_weight: Biasing weight for phrase list (0.0-2.0). :vartype biasing_weight: float """ @@ -109,7 +109,7 @@ class PhraseListProperties(_Model): biasing_weight: Optional[float] = rest_field( name="biasingWeight", visibility=["read", "create", "update", "delete", "query"] ) - """Biasing weight for phrase list (1.0 to 20.0).""" + """Biasing weight for phrase list (0.0-2.0).""" @overload def __init__( diff --git a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_patch.py b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_patch.py index 87676c65a8f0..1f20b6224737 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_patch.py +++ b/sdk/cognitiveservices/azure-ai-transcription/azure/ai/transcription/models/_patch.py @@ -7,9 +7,54 @@ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize """ +from typing import Any, Optional +from ._models import EnhancedModeProperties as _EnhancedModeProperties -__all__: list[str] = [] # Add all objects you want publicly available to users at this package level + +class EnhancedModeProperties(_EnhancedModeProperties): + """Enhanced mode 
properties for transcription. + + :ivar task: Task type for enhanced mode. + :vartype task: str + :ivar target_language: Target language for enhanced mode. + :vartype target_language: str + :ivar prompt: A list of user prompts. + :vartype prompt: list[str] + """ + + def __init__( + self, + *, + task: Optional[str] = None, + target_language: Optional[str] = None, + prompt: Optional[list[str]] = None, + **kwargs: Any, + ) -> None: + super().__init__(task=task, target_language=target_language, prompt=prompt, **kwargs) + # Automatically set enabled=True if any enhanced mode properties are specified + # This is hidden from public API but sent to the server + self._enabled: Optional[bool] = None + if task is not None or target_language is not None or prompt is not None: + self._enabled = True + + def as_dict(self, *, exclude_readonly: bool = False) -> dict[str, Any]: + """Return a dict that can be turned into json using json.dump. + + :keyword bool exclude_readonly: Whether to remove the readonly properties. 
+ :returns: A dict JSON compatible object + :rtype: dict + """ + result = super().as_dict(exclude_readonly=exclude_readonly) + # Always include enabled in the request if it's set + if self._enabled is not None: + result["enabled"] = self._enabled + return result + + +__all__: list[str] = [ + "EnhancedModeProperties" +] # Add all objects you want publicly available to users at this package level def patch_sdk(): diff --git a/sdk/cognitiveservices/azure-ai-transcription/cspell.json b/sdk/cognitiveservices/azure-ai-transcription/cspell.json index 26702324f058..f08faafb5cf8 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/cspell.json +++ b/sdk/cognitiveservices/azure-ai-transcription/cspell.json @@ -7,6 +7,7 @@ "words": [ "diarization", "pclp", - "pcsp" + "pcsp", + "Rehaan" ] } diff --git a/sdk/cognitiveservices/azure-ai-transcription/pyproject.toml b/sdk/cognitiveservices/azure-ai-transcription/pyproject.toml index 95d02d2172c1..96a3aebb064b 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/pyproject.toml +++ b/sdk/cognitiveservices/azure-ai-transcription/pyproject.toml @@ -32,7 +32,7 @@ keywords = ["azure", "azure sdk"] dependencies = [ "isodate>=0.6.1", - "azure-core>=1.35.0", + "azure-core>=1.37.0", "typing-extensions>=4.6.0", ] dynamic = [ diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/assets/sample-howstheweather-cn.wav b/sdk/cognitiveservices/azure-ai-transcription/samples/assets/sample-howstheweather-cn.wav new file mode 100644 index 000000000000..65d23513004b Binary files /dev/null and b/sdk/cognitiveservices/azure-ai-transcription/samples/assets/sample-howstheweather-cn.wav differ diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/assets/sample-whatstheweatherlike-en.mp3 b/sdk/cognitiveservices/azure-ai-transcription/samples/assets/sample-whatstheweatherlike-en.mp3 new file mode 100644 index 000000000000..e342abca6fb8 Binary files /dev/null and 
b/sdk/cognitiveservices/azure-ai-transcription/samples/assets/sample-whatstheweatherlike-en.mp3 differ diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_audio_file_async.py b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_audio_file_async.py index 8e73f72d5a64..72b750c213a0 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_audio_file_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_audio_file_async.py @@ -32,10 +32,18 @@ async def sample_transcribe_audio_file_async(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - async with TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as client: + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: # Path to your audio file import pathlib diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_from_url_async.py b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_from_url_async.py index d4f8fbbc83b6..ce71ae7d4a67 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_from_url_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_from_url_async.py @@ -32,10 +32,18 @@ async def sample_transcribe_from_url_async(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] 
- api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - async with TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as client: + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: # URL to your audio file (must be publicly accessible) audio_url = "https://example.com/path/to/audio.wav" diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_multiple_languages_async.py b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_multiple_languages_async.py index 431398c6c2fd..7d5d384ef465 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_multiple_languages_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_multiple_languages_async.py @@ -9,9 +9,21 @@ FILE: sample_transcribe_multiple_languages_async.py DESCRIPTION: - This sample demonstrates how to asynchronously transcribe an audio file with - multiple language detection using the Azure AI Transcription client. - This is useful for multilingual content. + This sample demonstrates how to transcribe audio with multilingual content + using the asynchronous Azure AI Transcription client. + + When your audio contains multilingual content that switches between different + languages, use the multilingual transcription model by NOT specifying any + locales. The service will automatically detect and transcribe each language + segment. 
+ + Supported locales: + de-DE, en-AU, en-CA, en-GB, en-IN, en-US, es-ES, es-MX, fr-CA, fr-FR, + it-IT, ja-JP, ko-KR, zh-CN + + Note: This feature is currently in preview. The multilingual model outputs + the "major locale" for each language (e.g., always "en-US" for English + regardless of accent). USAGE: python sample_transcribe_multiple_languages_async.py @@ -19,34 +31,50 @@ Set the environment variables with your own values before running the sample: 1) AZURE_SPEECH_ENDPOINT - the endpoint to your Speech resource. 2) AZURE_SPEECH_API_KEY - your Speech API key. + +RELATED RESOURCES: + - Fast transcription - Multilingual transcription: + https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create?tabs=multilingual-transcription-on """ import asyncio import os +import pathlib -async def sample_transcribe_multiple_languages_async(): - # [START transcribe_multiple_languages_async] +async def sample_transcribe_multilingual_async(): + """Transcribe audio with multilingual content (Preview). + + For multilingual content, do not specify any locales. The service will + automatically detect and transcribe each language segment. 
+ """ + # [START transcribe_multilingual_async] from azure.core.credentials import AzureKeyCredential from azure.ai.transcription.aio import TranscriptionClient from azure.ai.transcription.models import TranscriptionContent, TranscriptionOptions # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] - # Create the transcription client - async with TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as client: - # Path to your audio file with multiple languages - import pathlib + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + credential = DefaultAzureCredential() + + # Create the transcription client + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: + # Path to your audio file with multilingual content audio_file_path = pathlib.Path(__file__).parent.parent / "assets" / "audio.wav" # Open and read the audio file with open(audio_file_path, "rb") as audio_file: - # Create transcription options with multiple language candidates - # The service will detect which language is being spoken - options = TranscriptionOptions(locales=["en-US", "es-ES", "fr-FR", "de-DE"]) # Multiple candidates + # For multilingual content, do NOT specify any locales + # The service will automatically detect and transcribe each language + options = TranscriptionOptions() # Create the request content request_content = TranscriptionContent(definition=options, audio=audio_file) @@ -55,15 +83,12 @@ async def sample_transcribe_multiple_languages_async(): result = await client.transcribe(request_content) # Print the transcription result with locale information - print("Transcription with language detection:\n") - if result.phrases: - for phrase in 
result.phrases: - locale = phrase.locale if hasattr(phrase, "locale") and phrase.locale else "detected" - print(f"[{locale}] {phrase.text}") - else: - print(f"Full transcription: {result.combined_phrases[0].text}") - # [END transcribe_multiple_languages_async] + print("Multilingual Transcription:\n") + for phrase in result.phrases: + locale = phrase.locale if phrase.locale else "auto-detected" + print(f"[{locale}] {phrase.text}") + # [END transcribe_multilingual_async] if __name__ == "__main__": - asyncio.run(sample_transcribe_multiple_languages_async()) + asyncio.run(sample_transcribe_multilingual_async()) diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_diarization_async.py b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_diarization_async.py index 447d70ba3f77..0f791a858567 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_diarization_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_diarization_async.py @@ -37,10 +37,18 @@ async def sample_transcribe_with_diarization_async(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - async with TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as client: + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: # Path to your audio file with multiple speakers import pathlib diff --git 
a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_enhanced_mode_async.py b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_enhanced_mode_async.py index 4fc51688bf13..5ae02b13ac5e 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_enhanced_mode_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_enhanced_mode_async.py @@ -9,9 +9,29 @@ FILE: sample_transcribe_with_enhanced_mode_async.py DESCRIPTION: - This sample demonstrates how to transcribe an audio file with enhanced mode enabled - using the asynchronous Azure AI Transcription client. Enhanced mode provides - advanced capabilities such as translation or summarization during transcription. + This sample demonstrates how to use LLM-powered Enhanced Mode for transcription + and translation using the asynchronous Azure AI Transcription client. Enhanced + Mode uses LLM-powered speech recognition to provide improved transcription + accuracy, real-time translation, prompt-based customization, and multilingual + support with GPU acceleration. 
+ + Supported Tasks: + +-------------+--------------------------------------------------------------+ + | Task | Description | + +-------------+--------------------------------------------------------------+ + | transcribe | Transcribe audio in the input language (auto-detected or | + | | specified) | + | translate | Translate audio to a specified target language | + +-------------+--------------------------------------------------------------+ + + Limitations: + - `confidence` is not available and always returns 0 + - Word-level timing (offset_milliseconds, duration_milliseconds) is not + supported for the `translate` task + - Diarization is not supported for the `translate` task (only speaker1 + label is returned) + - `locales` and `phrase_lists` options are not required or applicable + with Enhanced Mode USAGE: python sample_transcribe_with_enhanced_mode_async.py @@ -19,13 +39,25 @@ Set the environment variables with your own values before running the sample: 1) AZURE_SPEECH_ENDPOINT - the endpoint to your Speech resource. 2) AZURE_SPEECH_API_KEY - your Speech API key. + +RELATED RESOURCES: + - LLM speech for speech transcription and translation (preview): + https://learn.microsoft.com/azure/ai-services/speech-service/llm-speech + - Fast transcription: + https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create """ import asyncio import os +import pathlib async def sample_transcribe_with_enhanced_mode_async(): + """Transcribe audio using Enhanced Mode for improved quality. + + Use Enhanced Mode for improved transcription quality with LLM-powered + speech recognition. 
+ """ # [START transcribe_with_enhanced_mode_async] from azure.core.credentials import AzureKeyCredential from azure.ai.transcription.aio import TranscriptionClient @@ -37,30 +69,139 @@ async def sample_transcribe_with_enhanced_mode_async(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - async with TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as client: + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: # Path to your audio file - import pathlib - audio_file_path = pathlib.Path(__file__).parent.parent / "assets" / "audio.wav" # Open and read the audio file with open(audio_file_path, "rb") as audio_file: - # Create enhanced mode properties - # Enable enhanced mode for advanced processing capabilities + # Enhanced mode is automatically enabled when task is specified + enhanced_mode = EnhancedModeProperties(task="transcribe") + + # Create transcription options with enhanced mode + options = TranscriptionOptions(enhanced_mode=enhanced_mode) + + # Create the request content + request_content = TranscriptionContent(definition=options, audio=audio_file) + + # Transcribe the audio with enhanced mode + result = await client.transcribe(request_content) + + # Print the transcription result + print(result.combined_phrases[0].text) + # [END transcribe_with_enhanced_mode_async] + + +async def sample_translate_with_enhanced_mode_async(): + """Translate speech to another language using Enhanced Mode. + + Translate speech to a target language during transcription. 
Specify the + target language using the language code (e.g., `en` for English, `ko` for + Korean, `es` for Spanish). + """ + # [START translate_with_enhanced_mode_async] + from azure.core.credentials import AzureKeyCredential + from azure.ai.transcription.aio import TranscriptionClient + from azure.ai.transcription.models import ( + TranscriptionContent, + TranscriptionOptions, + EnhancedModeProperties, + ) + + # Get configuration from environment variables + endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() + + # Create the transcription client + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: + # Path to your audio file (e.g., Chinese audio) + audio_file_path = pathlib.Path(__file__).parent.parent / "assets" / "sample-howstheweather-cn.wav" + + # Open and read the audio file + with open(audio_file_path, "rb") as audio_file: + # Translate Chinese speech to Korean enhanced_mode = EnhancedModeProperties( - task="translation", # Specify the task type (e.g., "translation", "summarization") - target_language="es-ES", # Target language for translation - prompt=[ - "Translate the following audio to Spanish", - "Focus on technical terminology", - ], # Optional prompts to guide the enhanced mode + task="translate", + target_language="ko", # Translate to Korean ) # Create transcription options with enhanced mode - options = TranscriptionOptions(locales=["en-US"], enhanced_mode=enhanced_mode) + options = TranscriptionOptions(enhanced_mode=enhanced_mode) + + # Create the request content + request_content = TranscriptionContent(definition=options, audio=audio_file) + + # Transcribe and translate the audio + result = await 
client.transcribe(request_content) + + # Print the translated result + print("Translated to Korean:") + print(result.combined_phrases[0].text) + # [END translate_with_enhanced_mode_async] + + +async def sample_enhanced_mode_with_prompts_async(): + """Use prompts to guide output format and improve recognition. + + Provide prompts to improve recognition or control output format. Prompts + are optional text that guides the output style for `transcribe` or + `translate` tasks. + """ + # [START enhanced_mode_with_prompts_async] + from azure.core.credentials import AzureKeyCredential + from azure.ai.transcription.aio import TranscriptionClient + from azure.ai.transcription.models import ( + TranscriptionContent, + TranscriptionOptions, + EnhancedModeProperties, + ) + + # Get configuration from environment variables + endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() + + # Create the transcription client + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: + # Path to your audio file + audio_file_path = pathlib.Path(__file__).parent.parent / "assets" / "sample-whatstheweatherlike-en.mp3" + + # Open and read the audio file + with open(audio_file_path, "rb") as audio_file: + # Guide output formatting using prompts + enhanced_mode = EnhancedModeProperties( + task="transcribe", + prompt=["Output must be in lexical format."], + ) + + # Create transcription options with enhanced mode + options = TranscriptionOptions(enhanced_mode=enhanced_mode) # Create the request content request_content = TranscriptionContent(definition=options, audio=audio_file) @@ -69,16 +210,99 @@ async def sample_transcribe_with_enhanced_mode_async(): result = await 
client.transcribe(request_content) # Print the transcription result - print("Transcription with enhanced mode:") - print(f"{result.combined_phrases[0].text}") - - # Print individual phrases if available - if result.phrases: - print("\nDetailed phrases:") - for phrase in result.phrases: - print(f" [{phrase.offset_milliseconds}ms]: {phrase.text}") - # [END transcribe_with_enhanced_mode_async] + print(result.combined_phrases[0].text) + # [END enhanced_mode_with_prompts_async] + + +async def sample_enhanced_mode_with_diarization_async(): + """Combine Enhanced Mode with diarization and profanity filtering. + + Enhanced Mode can be combined with other transcription options like + `diarization`, `profanity_filter_mode`, and `channels` for comprehensive + transcription scenarios such as meeting transcription. + + Note: Diarization is only supported for the `transcribe` task, not for + `translate`. + """ + # [START enhanced_mode_with_diarization_async] + from azure.core.credentials import AzureKeyCredential + from azure.ai.transcription.aio import TranscriptionClient + from azure.ai.transcription.models import ( + TranscriptionContent, + TranscriptionOptions, + EnhancedModeProperties, + TranscriptionDiarizationOptions, + ) + + # Get configuration from environment variables + endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() + + # Create the transcription client + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: + # Path to your audio file (e.g., a meeting recording) + audio_file_path = pathlib.Path(__file__).parent.parent / "assets" / "audio.wav" + + # Open and read the audio file + with open(audio_file_path, "rb") as audio_file: + # Configure 
enhanced mode with prompts + enhanced_mode = EnhancedModeProperties( + task="transcribe", + prompt=["Output must be in lexical format."], + ) + + # Configure diarization to identify different speakers + diarization_options = TranscriptionDiarizationOptions(max_speakers=2) + + # Create transcription options with enhanced mode, diarization, and profanity filter + options = TranscriptionOptions( + enhanced_mode=enhanced_mode, + profanity_filter_mode="Masked", + diarization_options=diarization_options, + ) + + # Create the request content + request_content = TranscriptionContent(definition=options, audio=audio_file) + + # Transcribe the audio with enhanced mode + result = await client.transcribe(request_content) + + # Print transcription with speaker information + for phrase in result.phrases: + speaker = phrase.speaker if phrase.speaker is not None else "Unknown" + print(f"[Speaker {speaker}] {phrase.text}") + # [END enhanced_mode_with_diarization_async] + + +async def main(): + print("=" * 60) + print("Sample 1: Transcribe with Enhanced Mode (Async)") + print("=" * 60) + await sample_transcribe_with_enhanced_mode_async() + + print("\n" + "=" * 60) + print("Sample 2: Translate with Enhanced Mode (Async)") + print("=" * 60) + await sample_translate_with_enhanced_mode_async() + + print("\n" + "=" * 60) + print("Sample 3: Enhanced Mode with Prompt Tuning (Async)") + print("=" * 60) + await sample_enhanced_mode_with_prompts_async() + + print("\n" + "=" * 60) + print("Sample 4: Combine Enhanced Mode with Other Options (Async)") + print("=" * 60) + await sample_enhanced_mode_with_diarization_async() if __name__ == "__main__": - asyncio.run(sample_transcribe_with_enhanced_mode_async()) + asyncio.run(main()) diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_phrase_list_async.py b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_phrase_list_async.py index bf04baf2c8c4..fdbc989ae325 
100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_phrase_list_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_phrase_list_async.py @@ -9,9 +9,17 @@ FILE: sample_transcribe_with_phrase_list_async.py DESCRIPTION: - This sample demonstrates how to asynchronously transcribe an audio file with a - custom phrase list to improve recognition accuracy for domain-specific terminology - using the Azure AI Transcription client. + This sample demonstrates how to use custom phrase lists to improve transcription + accuracy with the asynchronous Azure AI Transcription client. + + A phrase list allows you to provide domain-specific terms, product names, + technical jargon, or other words that may not be well-recognized by the + default speech model. This improves accuracy for specialized content. + + For example, without a phrase list: + - "Jessie" might be recognized as "Jesse" + - "Rehaan" might be recognized as "everyone" + - "Contoso" might be recognized as "can't do so" USAGE: python sample_transcribe_with_phrase_list_async.py @@ -23,9 +31,11 @@ import asyncio import os +import pathlib async def sample_transcribe_with_phrase_list_async(): + """Transcribe audio with a custom phrase list to improve recognition accuracy.""" # [START transcribe_with_phrase_list_async] from azure.core.credentials import AzureKeyCredential from azure.ai.transcription.aio import TranscriptionClient @@ -37,33 +47,31 @@ async def sample_transcribe_with_phrase_list_async(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = 
DefaultAzureCredential() # Create the transcription client - async with TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as client: + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: # Path to your audio file with domain-specific terminology - import pathlib - audio_file_path = pathlib.Path(__file__).parent.parent / "assets" / "audio.wav" # Open and read the audio file with open(audio_file_path, "rb") as audio_file: - # Create a phrase list with custom terminology - # This helps improve recognition accuracy for specific words + # Add custom phrases to improve recognition of names and domain-specific terms + # For example, "Jessie" might be recognized as "Jesse", or "Contoso" as "can't do so" phrase_list = PhraseListProperties( - phrases=[ - "Azure", - "Cognitive Services", - "Speech SDK", - "TranscriptionClient", - "Kubernetes", - "microservices", - ], - biasing_weight=5.0, # Weight between 1.0 and 20.0 (higher = more bias) + phrases=["Contoso", "Jessie", "Rehaan"] ) # Create transcription options with phrase list - options = TranscriptionOptions(locales=["en-US"], phrase_list=phrase_list) + options = TranscriptionOptions(phrase_list=phrase_list) # Create the request content request_content = TranscriptionContent(definition=options, audio=audio_file) @@ -73,13 +81,7 @@ async def sample_transcribe_with_phrase_list_async(): # Print the transcription result print("Transcription with custom phrase list:") - print(f"{result.combined_phrases[0].text}") - - # Print individual phrases if available - if result.phrases: - print("\nDetailed phrases:") - for phrase in result.phrases: - print(f" [{phrase.offset_milliseconds}ms]: {phrase.text}") + print(result.combined_phrases[0].text) # [END transcribe_with_phrase_list_async] diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_profanity_filter_async.py 
b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_profanity_filter_async.py index 2b7a953d5d65..bd34e984e21c 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_profanity_filter_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/async_samples/sample_transcribe_with_profanity_filter_async.py @@ -33,10 +33,18 @@ async def sample_transcribe_with_profanity_filter_async(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity.aio import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - async with TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as client: + async with TranscriptionClient(endpoint=endpoint, credential=credential) as client: # Path to your audio file import pathlib diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_audio_file.py b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_audio_file.py index 78c037ddc011..1156c2fc7acc 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_audio_file.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_audio_file.py @@ -31,10 +31,18 @@ def sample_transcribe_audio_file(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from 
azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) + client = TranscriptionClient(endpoint=endpoint, credential=credential) # Path to your audio file import pathlib diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_from_url.py b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_from_url.py index 4bb8c18479f9..5832739a4063 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_from_url.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_from_url.py @@ -31,10 +31,18 @@ def sample_transcribe_from_url(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) + client = TranscriptionClient(endpoint=endpoint, credential=credential) # URL to your audio file (must be publicly accessible) audio_url = "https://example.com/path/to/audio.wav" diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_multiple_languages.py b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_multiple_languages.py index 46af6dcdd09e..d83580877c5c 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_multiple_languages.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_multiple_languages.py @@ -9,9 +9,21 @@ FILE: 
sample_transcribe_multiple_languages.py DESCRIPTION: - This sample demonstrates how to transcribe an audio file with multiple language - detection using the Azure AI Transcription client. This is useful for - multilingual content. + This sample demonstrates how to transcribe audio with multilingual content + using the Azure AI Transcription client. + + When your audio contains multilingual content that switches between different + languages, use the multilingual transcription model by NOT specifying any + locales. The service will automatically detect and transcribe each language + segment. + + Supported locales: + de-DE, en-AU, en-CA, en-GB, en-IN, en-US, es-ES, es-MX, fr-CA, fr-FR, + it-IT, ja-JP, ko-KR, zh-CN + + Note: This feature is currently in preview. The multilingual model outputs + the "major locale" for each language (e.g., always "en-US" for English + regardless of accent). USAGE: python sample_transcribe_multiple_languages.py @@ -19,34 +31,50 @@ Set the environment variables with your own values before running the sample: 1) AZURE_SPEECH_ENDPOINT - the endpoint to your Speech resource. 2) AZURE_SPEECH_API_KEY - your Speech API key. + +RELATED RESOURCES: + - Fast transcription - Multilingual transcription: + https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create?tabs=multilingual-transcription-on """ import os +import pathlib -def sample_transcribe_multiple_languages(): - # [START transcribe_multiple_languages] +def sample_transcribe_multilingual(): + """Transcribe audio with multilingual content (Preview). + + For multilingual content, do not specify any locales. The service will + automatically detect and transcribe each language segment. 
+ """ + # [START transcribe_multilingual] from azure.core.credentials import AzureKeyCredential from azure.ai.transcription import TranscriptionClient from azure.ai.transcription.models import TranscriptionContent, TranscriptionOptions # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] - # Create the transcription client - client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential - # Path to your audio file with multiple languages - import pathlib + credential = DefaultAzureCredential() + + # Create the transcription client + client = TranscriptionClient(endpoint=endpoint, credential=credential) + # Path to your audio file with multilingual content audio_file_path = pathlib.Path(__file__).parent / "assets" / "audio.wav" # Open and read the audio file with open(audio_file_path, "rb") as audio_file: - # Create transcription options with multiple language candidates - # The service will detect which language is being spoken - options = TranscriptionOptions(locales=["en-US", "es-ES", "fr-FR", "de-DE"]) # Multiple language candidates + # For multilingual content, do NOT specify any locales + # The service will automatically detect and transcribe each language + options = TranscriptionOptions() # Create the request content request_content = TranscriptionContent(definition=options, audio=audio_file) @@ -55,15 +83,12 @@ def sample_transcribe_multiple_languages(): result = client.transcribe(request_content) # Print the transcription result with locale information - print("Transcription with language detection:\n") - if result.phrases: - for phrase in result.phrases: - locale = phrase.locale if hasattr(phrase, 
"locale") and phrase.locale else "detected" - print(f"[{locale}] {phrase.text}") - else: - print(f"Full transcription: {result.combined_phrases[0].text}") - # [END transcribe_multiple_languages] + print("Multilingual Transcription:\n") + for phrase in result.phrases: + locale = phrase.locale if phrase.locale else "auto-detected" + print(f"[{locale}] {phrase.text}") + # [END transcribe_multilingual] if __name__ == "__main__": - sample_transcribe_multiple_languages() + sample_transcribe_multilingual() diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_diarization.py b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_diarization.py index 8cf35504bb61..4ffe2c28d133 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_diarization.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_diarization.py @@ -36,10 +36,18 @@ def sample_transcribe_with_diarization(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) + client = TranscriptionClient(endpoint=endpoint, credential=credential) # Path to your audio file with multiple speakers import pathlib diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_enhanced_mode.py b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_enhanced_mode.py index 5eed28c6b5df..1d6b2899580a 100644 --- 
a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_enhanced_mode.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_enhanced_mode.py @@ -9,9 +9,29 @@ FILE: sample_transcribe_with_enhanced_mode.py DESCRIPTION: - This sample demonstrates how to transcribe an audio file with enhanced mode enabled. - Enhanced mode provides advanced capabilities such as translation or summarization - during transcription using the Azure AI Transcription client. + This sample demonstrates how to use LLM-powered Enhanced Mode for transcription + and translation using the Azure AI Transcription client. Enhanced Mode uses + LLM-powered speech recognition to provide improved transcription accuracy, + real-time translation, prompt-based customization, and multilingual support + with GPU acceleration. + + Supported Tasks: + +-------------+--------------------------------------------------------------+ + | Task | Description | + +-------------+--------------------------------------------------------------+ + | transcribe | Transcribe audio in the input language (auto-detected or | + | | specified) | + | translate | Translate audio to a specified target language | + +-------------+--------------------------------------------------------------+ + + Limitations: + - `confidence` is not available and always returns 0 + - Word-level timing (offset_milliseconds, duration_milliseconds) is not + supported for the `translate` task + - Diarization is not supported for the `translate` task (only speaker1 + label is returned) + - `locales` and `phrase_lists` options are not required or applicable + with Enhanced Mode USAGE: python sample_transcribe_with_enhanced_mode.py @@ -19,12 +39,24 @@ Set the environment variables with your own values before running the sample: 1) AZURE_SPEECH_ENDPOINT - the endpoint to your Speech resource. 2) AZURE_SPEECH_API_KEY - your Speech API key. 
+ +RELATED RESOURCES: + - LLM speech for speech transcription and translation (preview): + https://learn.microsoft.com/azure/ai-services/speech-service/llm-speech + - Fast transcription: + https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create """ import os +import pathlib def sample_transcribe_with_enhanced_mode(): + """Transcribe audio using Enhanced Mode for improved quality. + + Use Enhanced Mode for improved transcription quality with LLM-powered + speech recognition. + """ # [START transcribe_with_enhanced_mode] from azure.core.credentials import AzureKeyCredential from azure.ai.transcription import TranscriptionClient @@ -36,31 +68,142 @@ def sample_transcribe_with_enhanced_mode(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) + client = TranscriptionClient(endpoint=endpoint, credential=credential) # Path to your audio file - import pathlib - audio_file_path = pathlib.Path(__file__).parent / "assets" / "audio.wav" # Open and read the audio file with open(audio_file_path, "rb") as audio_file: - # Create enhanced mode properties - # Enable enhanced mode for advanced processing capabilities + # Enhanced mode is automatically enabled when task is specified + enhanced_mode = EnhancedModeProperties(task="transcribe") + + # Create transcription options with enhanced mode + options = TranscriptionOptions(enhanced_mode=enhanced_mode) + + # Create the request content + request_content = TranscriptionContent(definition=options, 
audio=audio_file) + + # Transcribe the audio with enhanced mode + result = client.transcribe(request_content) + + # Print the transcription result + print(result.combined_phrases[0].text) + # [END transcribe_with_enhanced_mode] + + +def sample_translate_with_enhanced_mode(): + """Translate speech to another language using Enhanced Mode. + + Translate speech to a target language during transcription. Specify the + target language using the language code (e.g., `en` for English, `ko` for + Korean, `es` for Spanish). + """ + # [START translate_with_enhanced_mode] + from azure.core.credentials import AzureKeyCredential + from azure.ai.transcription import TranscriptionClient + from azure.ai.transcription.models import ( + TranscriptionContent, + TranscriptionOptions, + EnhancedModeProperties, + ) + + # Get configuration from environment variables + endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() + + # Create the transcription client + client = TranscriptionClient(endpoint=endpoint, credential=credential) + + # Path to your audio file (e.g., Chinese audio) + audio_file_path = pathlib.Path(__file__).parent / "assets" / "sample-howstheweather-cn.wav" + + # Open and read the audio file + with open(audio_file_path, "rb") as audio_file: + # Translate Chinese speech to Korean + enhanced_mode = EnhancedModeProperties( + task="translate", + target_language="ko", # Translate to Korean + ) + + # Create transcription options with enhanced mode + options = TranscriptionOptions(enhanced_mode=enhanced_mode) + + # Create the request content + request_content = TranscriptionContent(definition=options, audio=audio_file) + + # Transcribe and translate the audio + result = 
client.transcribe(request_content) + + # Print the translated result + print("Translated to Korean:") + print(result.combined_phrases[0].text) + # [END translate_with_enhanced_mode] + + +def sample_enhanced_mode_with_prompts(): + """Use prompts to guide output format and improve recognition. + + Provide prompts to improve recognition or control output format. Prompts + are optional text that guides the output style for `transcribe` or + `translate` tasks. + """ + # [START enhanced_mode_with_prompts] + from azure.core.credentials import AzureKeyCredential + from azure.ai.transcription import TranscriptionClient + from azure.ai.transcription.models import ( + TranscriptionContent, + TranscriptionOptions, + EnhancedModeProperties, + ) + + # Get configuration from environment variables + endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() + + # Create the transcription client + client = TranscriptionClient(endpoint=endpoint, credential=credential) + + # Path to your audio file + audio_file_path = pathlib.Path(__file__).parent / "assets" / "sample-whatstheweatherlike-en.mp3" + + # Open and read the audio file + with open(audio_file_path, "rb") as audio_file: + # Guide output formatting using prompts enhanced_mode = EnhancedModeProperties( - task="translation", # Specify the task type (e.g., "translation", "summarization") - target_language="es-ES", # Target language for translation - prompt=[ - "Translate the following audio to Spanish", - "Focus on technical terminology", - ], # Optional prompts to guide the enhanced mode + task="transcribe", + prompt=["Output must be in lexical format."], ) # Create transcription options with enhanced mode - options = 
TranscriptionOptions(locales=["en-US"], enhanced_mode=enhanced_mode) + options = TranscriptionOptions(enhanced_mode=enhanced_mode) # Create the request content request_content = TranscriptionContent(definition=options, audio=audio_file) @@ -69,16 +212,96 @@ def sample_transcribe_with_enhanced_mode(): result = client.transcribe(request_content) # Print the transcription result - print("Transcription with enhanced mode:") - print(f"{result.combined_phrases[0].text}") - - # Print individual phrases if available - if result.phrases: - print("\nDetailed phrases:") - for phrase in result.phrases: - print(f" [{phrase.offset_milliseconds}ms]: {phrase.text}") - # [END transcribe_with_enhanced_mode] + print(result.combined_phrases[0].text) + # [END enhanced_mode_with_prompts] + + +def sample_enhanced_mode_with_diarization(): + """Combine Enhanced Mode with diarization and profanity filtering. + + Enhanced Mode can be combined with other transcription options like + `diarization`, `profanity_filter_mode`, and `channels` for comprehensive + transcription scenarios such as meeting transcription. + + Note: Diarization is only supported for the `transcribe` task, not for + `translate`. 
+ """ + # [START enhanced_mode_with_diarization] + from azure.core.credentials import AzureKeyCredential + from azure.ai.transcription import TranscriptionClient + from azure.ai.transcription.models import ( + TranscriptionContent, + TranscriptionOptions, + EnhancedModeProperties, + TranscriptionDiarizationOptions, + ) + + # Get configuration from environment variables + endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() + + # Create the transcription client + client = TranscriptionClient(endpoint=endpoint, credential=credential) + + # Path to your audio file (e.g., a meeting recording) + audio_file_path = pathlib.Path(__file__).parent / "assets" / "audio.wav" + + # Open and read the audio file + with open(audio_file_path, "rb") as audio_file: + # Configure enhanced mode with prompts + enhanced_mode = EnhancedModeProperties( + task="transcribe", + prompt=["Output must be in lexical format."], + ) + + # Configure diarization to identify different speakers + diarization_options = TranscriptionDiarizationOptions(max_speakers=2) + + # Create transcription options with enhanced mode, diarization, and profanity filter + options = TranscriptionOptions( + enhanced_mode=enhanced_mode, + profanity_filter_mode="Masked", + diarization_options=diarization_options, + ) + + # Create the request content + request_content = TranscriptionContent(definition=options, audio=audio_file) + + # Transcribe the audio with enhanced mode + result = client.transcribe(request_content) + + # Print transcription with speaker information + for phrase in result.phrases: + speaker = phrase.speaker if phrase.speaker is not None else "Unknown" + print(f"[Speaker {speaker}] {phrase.text}") + # [END 
enhanced_mode_with_diarization] if __name__ == "__main__": + print("=" * 60) + print("Sample 1: Transcribe with Enhanced Mode") + print("=" * 60) sample_transcribe_with_enhanced_mode() + + print("\n" + "=" * 60) + print("Sample 2: Translate with Enhanced Mode") + print("=" * 60) + sample_translate_with_enhanced_mode() + + print("\n" + "=" * 60) + print("Sample 3: Enhanced Mode with Prompt Tuning") + print("=" * 60) + sample_enhanced_mode_with_prompts() + + print("\n" + "=" * 60) + print("Sample 4: Combine Enhanced Mode with Other Options") + print("=" * 60) + sample_enhanced_mode_with_diarization() diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_phrase_list.py b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_phrase_list.py index d76ed0b72646..f0dc94d06684 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_phrase_list.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_phrase_list.py @@ -9,8 +9,17 @@ FILE: sample_transcribe_with_phrase_list.py DESCRIPTION: - This sample demonstrates how to transcribe an audio file with a custom phrase list - to improve recognition accuracy for domain-specific terminology using the Azure AI Transcription client. + This sample demonstrates how to use custom phrase lists to improve transcription + accuracy with the Azure AI Transcription client. + + A phrase list allows you to provide domain-specific terms, product names, + technical jargon, or other words that may not be well-recognized by the + default speech model. This improves accuracy for specialized content. 
+ + For example, without a phrase list: + - "Jessie" might be recognized as "Jesse" + - "Rehaan" might be recognized as "everyone" + - "Contoso" might be recognized as "can't do so" USAGE: python sample_transcribe_with_phrase_list.py @@ -21,9 +30,11 @@ """ import os +import pathlib def sample_transcribe_with_phrase_list(): + """Transcribe audio with a custom phrase list to improve recognition accuracy.""" # [START transcribe_with_phrase_list] from azure.core.credentials import AzureKeyCredential from azure.ai.transcription import TranscriptionClient @@ -35,27 +46,32 @@ def sample_transcribe_with_phrase_list(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) + client = TranscriptionClient(endpoint=endpoint, credential=credential) # Path to your audio file with domain-specific terminology - import pathlib - audio_file_path = pathlib.Path(__file__).parent / "assets" / "audio.wav" # Open and read the audio file with open(audio_file_path, "rb") as audio_file: - # Create a phrase list with custom terminology - # This helps improve recognition accuracy for specific words + # Add custom phrases to improve recognition of names and domain-specific terms + # For example, "Jessie" might be recognized as "Jesse", or "Contoso" as "can't do so" phrase_list = PhraseListProperties( - phrases=["Azure", "Cognitive Services", "Speech SDK", "TranscriptionClient", "Kubernetes", "microservices"], - biasing_weight=5.0, # Weight between 1.0 and 20.0 (higher = more bias) + 
phrases=["Contoso", "Jessie", "Rehaan"] ) # Create transcription options with phrase list - options = TranscriptionOptions(locales=["en-US"], phrase_list=phrase_list) + options = TranscriptionOptions(phrase_list=phrase_list) # Create the request content request_content = TranscriptionContent(definition=options, audio=audio_file) @@ -65,13 +81,7 @@ def sample_transcribe_with_phrase_list(): # Print the transcription result print("Transcription with custom phrase list:") - print(f"{result.combined_phrases[0].text}") - - # Print individual phrases if available - if result.phrases: - print("\nDetailed phrases:") - for phrase in result.phrases: - print(f" [{phrase.offset_milliseconds}ms]: {phrase.text}") + print(result.combined_phrases[0].text) # [END transcribe_with_phrase_list] diff --git a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_profanity_filter.py b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_profanity_filter.py index 6826139d7061..0018551e6abf 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_profanity_filter.py +++ b/sdk/cognitiveservices/azure-ai-transcription/samples/sample_transcribe_with_profanity_filter.py @@ -32,10 +32,18 @@ def sample_transcribe_with_profanity_filter(): # Get configuration from environment variables endpoint = os.environ["AZURE_SPEECH_ENDPOINT"] - api_key = os.environ["AZURE_SPEECH_API_KEY"] + + # We recommend using role-based access control (RBAC) for production scenarios + api_key = os.environ.get("AZURE_SPEECH_API_KEY") + if api_key: + credential = AzureKeyCredential(api_key) + else: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() # Create the transcription client - client = TranscriptionClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) + client = TranscriptionClient(endpoint=endpoint, credential=credential) # Path to your audio file import pathlib diff --git 
a/sdk/cognitiveservices/azure-ai-transcription/tests/assets/sample-howstheweather-cn.wav b/sdk/cognitiveservices/azure-ai-transcription/tests/assets/sample-howstheweather-cn.wav new file mode 100644 index 000000000000..65d23513004b Binary files /dev/null and b/sdk/cognitiveservices/azure-ai-transcription/tests/assets/sample-howstheweather-cn.wav differ diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/conftest.py b/sdk/cognitiveservices/azure-ai-transcription/tests/conftest.py index 2062175edbe0..bff6314d9826 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/conftest.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/conftest.py @@ -23,10 +23,7 @@ @pytest.fixture(scope="session") def transcription_endpoint(): """Fixture providing the transcription endpoint.""" - return os.environ.get( - "TRANSCRIPTION_ENDPOINT", - "https://fakeendpoint.cognitiveservices.azure.com" - ) + return os.environ.get("TRANSCRIPTION_ENDPOINT", "https://fakeendpoint.cognitiveservices.azure.com") @pytest.fixture(scope="session") @@ -38,10 +35,7 @@ def transcription_api_key(): @pytest.fixture(scope="session") def transcription_test_audio_url(): """Fixture providing a test audio URL.""" - return os.environ.get( - "TRANSCRIPTION_TEST_AUDIO_URL", - "https://example.com/test-audio.wav" - ) + return os.environ.get("TRANSCRIPTION_TEST_AUDIO_URL", "https://example.com/test-audio.wav") # autouse=True will trigger this fixture on each pytest run, even if it's not explicitly used by a test method @@ -58,38 +52,26 @@ def add_sanitizers(test_proxy): # This allows recordings made with API key auth to work with AAD auth in CI set_custom_default_matcher( excluded_headers="Authorization,Ocp-Apim-Subscription-Key", - ignored_headers="Authorization,Ocp-Apim-Subscription-Key" + ignored_headers="Authorization,Ocp-Apim-Subscription-Key", ) - + # Sanitize subscription and tenant IDs if they exist # Only sanitize if the values are actually set (not default fake values) 
transcription_subscription_id = os.environ.get("TRANSCRIPTION_SUBSCRIPTION_ID", "") if transcription_subscription_id and transcription_subscription_id != "00000000-0000-0000-0000-000000000000": - add_general_regex_sanitizer( - regex=transcription_subscription_id, - value="00000000-0000-0000-0000-000000000000" - ) - + add_general_regex_sanitizer(regex=transcription_subscription_id, value="00000000-0000-0000-0000-000000000000") + transcription_tenant_id = os.environ.get("TRANSCRIPTION_TENANT_ID", "") if transcription_tenant_id and transcription_tenant_id != "00000000-0000-0000-0000-000000000000": - add_general_regex_sanitizer( - regex=transcription_tenant_id, - value="00000000-0000-0000-0000-000000000000" - ) - + add_general_regex_sanitizer(regex=transcription_tenant_id, value="00000000-0000-0000-0000-000000000000") + transcription_client_id = os.environ.get("TRANSCRIPTION_CLIENT_ID", "") if transcription_client_id and transcription_client_id != "00000000-0000-0000-0000-000000000000": - add_general_regex_sanitizer( - regex=transcription_client_id, - value="00000000-0000-0000-0000-000000000000" - ) - + add_general_regex_sanitizer(regex=transcription_client_id, value="00000000-0000-0000-0000-000000000000") + transcription_client_secret = os.environ.get("TRANSCRIPTION_CLIENT_SECRET", "") if transcription_client_secret and transcription_client_secret != "00000000-0000-0000-0000-000000000000": - add_general_regex_sanitizer( - regex=transcription_client_secret, - value="00000000-0000-0000-0000-000000000000" - ) + add_general_regex_sanitizer(regex=transcription_client_secret, value="00000000-0000-0000-0000-000000000000") # Sanitize endpoint URLs transcription_endpoint = os.environ.get( @@ -97,8 +79,7 @@ def add_sanitizers(test_proxy): ) if transcription_endpoint and "fake" not in transcription_endpoint.lower(): add_general_string_sanitizer( - target=transcription_endpoint, - value="https://fake-transcription-endpoint.cognitiveservices.azure.com/" + 
target=transcription_endpoint, value="https://fake-transcription-endpoint.cognitiveservices.azure.com/" ) # Sanitize API keys in headers @@ -115,17 +96,18 @@ def add_sanitizers(test_proxy): # Sanitize audio URLs in request/response bodies add_body_key_sanitizer(json_path="$..audioUrl", value="https://fake-audio-url.blob.core.windows.net/audio/test.wav") - add_body_key_sanitizer(json_path="$..audio_url", value="https://fake-audio-url.blob.core.windows.net/audio/test.wav") + add_body_key_sanitizer( + json_path="$..audio_url", value="https://fake-audio-url.blob.core.windows.net/audio/test.wav" + ) # Sanitize storage account names and blob URLs add_uri_regex_sanitizer( - regex=r"https://[a-z0-9]+\.blob\.core\.windows\.net", - value="https://fakeaccount.blob.core.windows.net" + regex=r"https://[a-z0-9]+\.blob\.core\.windows\.net", value="https://fakeaccount.blob.core.windows.net" ) - + # Sanitize cognitive services hostnames to handle different endpoint formats # This handles both api.cognitive.microsoft.com and cognitiveservices.azure.com add_uri_regex_sanitizer( regex=r"https://[^/]+\.(api\.cognitive\.microsoft\.com|cognitiveservices\.azure\.com)", - value="https://Sanitized.cognitiveservices.azure.com" + value="https://Sanitized.cognitiveservices.azure.com", ) diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/preparer.py b/sdk/cognitiveservices/azure-ai-transcription/tests/preparer.py index 44c2ce052593..c710153e9f90 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/preparer.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/preparer.py @@ -17,39 +17,33 @@ class TranscriptionClientTestBase(AzureRecordedTestCase): def create_client(self, endpoint, **kwargs): """Create a synchronous TranscriptionClient for testing.""" # Try to get API key from environment or kwargs - api_key = kwargs.pop('transcription_api_key', os.environ.get('TRANSCRIPTION_API_KEY')) - + api_key = kwargs.pop("transcription_api_key", 
os.environ.get("TRANSCRIPTION_API_KEY")) + if api_key: # Use API key authentication credential = AzureKeyCredential(api_key) else: # Fall back to default credential credential = self.get_credential(TranscriptionClient) - + return self.create_client_from_credential( - TranscriptionClient, - credential=credential, - endpoint=endpoint, - **kwargs + TranscriptionClient, credential=credential, endpoint=endpoint, **kwargs ) def create_async_client(self, endpoint, **kwargs): """Create an asynchronous TranscriptionClient for testing.""" # Try to get API key from environment or kwargs - api_key = kwargs.pop('transcription_api_key', os.environ.get('TRANSCRIPTION_API_KEY')) - + api_key = kwargs.pop("transcription_api_key", os.environ.get("TRANSCRIPTION_API_KEY")) + if api_key: # Use API key authentication credential = AzureKeyCredential(api_key) else: # Fall back to default credential credential = self.get_credential(AsyncTranscriptionClient, is_async=True) - + return self.create_client_from_credential( - AsyncTranscriptionClient, - credential=credential, - endpoint=endpoint, - **kwargs + AsyncTranscriptionClient, credential=credential, endpoint=endpoint, **kwargs ) @@ -59,5 +53,5 @@ def create_async_client(self, endpoint, **kwargs): "transcription", transcription_endpoint="https://fakeendpoint.cognitiveservices.azure.com", transcription_api_key="fake-api-key", - transcription_test_audio_url="https://example.com/test-audio.wav" + transcription_test_audio_url="https://example.com/test-audio.wav", ) diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_basic.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_basic.py index a6c8f8d51c60..e7975722d09d 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_basic.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_basic.py @@ -17,10 +17,10 @@ class TestTranscriptionBasic(TranscriptionClientTestBase): def 
test_transcribe_url_basic(self, transcription_endpoint, transcription_test_audio_url): """Test basic transcription from a URL.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url result = client.transcribe_from_url(audio_url) - + # Verify response structure assert result is not None assert result.combined_phrases is not None @@ -32,15 +32,12 @@ def test_transcribe_url_basic(self, transcription_endpoint, transcription_test_a def test_transcribe_with_custom_locale(self, transcription_endpoint, transcription_test_audio_url): """Test transcription with a specific locale.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url - options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US"] - ) - + options = TranscriptionOptions(audio_url=audio_url, locales=["en-US"]) + result = client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 @@ -50,22 +47,22 @@ def test_transcribe_with_custom_locale(self, transcription_endpoint, transcripti def test_transcribe_result_structure(self, transcription_endpoint, transcription_test_audio_url): """Test that the transcription result has the expected structure.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url result = client.transcribe_from_url(audio_url) - + # Verify result structure assert result is not None - assert hasattr(result, 'combined_phrases') - assert hasattr(result, 'phrases') - assert hasattr(result, 'duration_milliseconds') - + assert hasattr(result, "combined_phrases") + assert hasattr(result, "phrases") + assert hasattr(result, "duration_milliseconds") + # Verify combined_phrases structure assert len(result.combined_phrases) > 0 - assert hasattr(result.combined_phrases[0], 'text') + assert hasattr(result.combined_phrases[0], "text") 
assert result.combined_phrases[0].text is not None - + # If phrases exist, verify their structure if result.phrases: phrase = result.phrases[0] - assert hasattr(phrase, 'text') + assert hasattr(phrase, "text") diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_basic_async.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_basic_async.py index 2cbf4edba578..f18bd9cbc002 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_basic_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_basic_async.py @@ -17,12 +17,12 @@ class TestTranscriptionBasicAsync(TranscriptionClientTestBase): async def test_transcribe_url_basic_async(self, transcription_endpoint, transcription_test_audio_url): """Test basic async transcription from a URL.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: audio_url = transcription_test_audio_url - + result = await client.transcribe_from_url(audio_url) - + # Verify response structure assert result is not None assert result.combined_phrases is not None @@ -34,18 +34,15 @@ async def test_transcribe_url_basic_async(self, transcription_endpoint, transcri async def test_transcribe_with_custom_locale_async(self, transcription_endpoint, transcription_test_audio_url): """Test async transcription from URL with custom locale.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: audio_url = transcription_test_audio_url - + # Create transcription options - options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US"] - ) - + options = TranscriptionOptions(audio_url=audio_url, locales=["en-US"]) + result = await client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 @@ -56,24 +53,24 @@ async def test_transcribe_with_custom_locale_async(self, 
transcription_endpoint, async def test_transcribe_result_structure_async(self, transcription_endpoint, transcription_test_audio_url): """Test that async transcription result has expected structure.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: audio_url = transcription_test_audio_url - + result = await client.transcribe_from_url(audio_url) - + # Verify result structure assert result is not None - assert hasattr(result, 'combined_phrases') - assert hasattr(result, 'phrases') - assert hasattr(result, 'duration_milliseconds') - + assert hasattr(result, "combined_phrases") + assert hasattr(result, "phrases") + assert hasattr(result, "duration_milliseconds") + # Verify combined_phrases structure assert len(result.combined_phrases) > 0 - assert hasattr(result.combined_phrases[0], 'text') + assert hasattr(result.combined_phrases[0], "text") assert result.combined_phrases[0].text is not None - + # If phrases exist, verify their structure if result.phrases: phrase = result.phrases[0] - assert hasattr(phrase, 'text') + assert hasattr(phrase, "text") diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_client_management.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_client_management.py index d2227b0eb6f6..049ffc40ac3e 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_client_management.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_client_management.py @@ -18,7 +18,7 @@ def test_client_context_manager(self, transcription_endpoint, transcription_test with self.create_client(endpoint=transcription_endpoint) as client: audio_url = transcription_test_audio_url result = client.transcribe_from_url(audio_url) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 @@ -28,11 +28,11 @@ def test_client_context_manager(self, transcription_endpoint, 
transcription_test def test_client_close(self, transcription_endpoint, transcription_test_audio_url): """Test explicit client close.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url result = client.transcribe_from_url(audio_url) - + assert result is not None - + # Explicitly close the client client.close() diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_client_management_async.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_client_management_async.py index 56556f4d7e14..f27755e8cc29 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_client_management_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_client_management_async.py @@ -18,8 +18,8 @@ async def test_client_context_manager_async(self, transcription_endpoint, transc # Test creating and using client with context manager async with self.create_async_client(endpoint=transcription_endpoint) as client: audio_url = transcription_test_audio_url - + result = await client.transcribe_from_url(audio_url) - + assert result is not None assert result.combined_phrases is not None diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_diarization.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_diarization.py index 5db09bdcbf18..0820a9cbea36 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_diarization.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_diarization.py @@ -17,18 +17,16 @@ class TestTranscriptionDiarization(TranscriptionClientTestBase): def test_transcribe_with_diarization_enabled(self, transcription_endpoint, transcription_test_audio_url): """Test transcription with speaker diarization enabled.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url - + # 
Enable diarization options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US"], - diarization_options=TranscriptionDiarizationOptions(max_speakers=2) + audio_url=audio_url, locales=["en-US"], diarization_options=TranscriptionDiarizationOptions(max_speakers=2) ) - + result = client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert result.phrases is not None diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_diarization_async.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_diarization_async.py index 65ac87621ac5..fe862b88d34a 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_diarization_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_diarization_async.py @@ -14,23 +14,25 @@ class TestTranscriptionDiarizationAsync(TranscriptionClientTestBase): @TranscriptionPreparer() @recorded_by_proxy_async - async def test_transcribe_with_diarization_enabled_async(self, transcription_endpoint, transcription_test_audio_url): + async def test_transcribe_with_diarization_enabled_async( + self, transcription_endpoint, transcription_test_audio_url + ): """Test async transcription with speaker diarization enabled.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: # For diarization, ideally use multi-speaker audio, but single-speaker works for testing audio_url = transcription_test_audio_url - + # Enable diarization options = TranscriptionOptions( audio_url=audio_url, locales=["en-US"], - diarization_options=TranscriptionDiarizationOptions(max_speakers=2) + diarization_options=TranscriptionDiarizationOptions(max_speakers=2), ) - + result = await client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert result.phrases is not None diff --git 
a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_enhanced_mode.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_enhanced_mode.py index ca0e72489ccc..391eac7a2d10 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_enhanced_mode.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_enhanced_mode.py @@ -17,21 +17,20 @@ class TestTranscriptionEnhancedMode(TranscriptionClientTestBase): def test_transcribe_enhanced_mode_with_prompt(self, transcription_endpoint, transcription_test_audio_url): """Test transcription with enhanced mode and prompt.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url - + # Use enhanced mode with prompts options = TranscriptionOptions( audio_url=audio_url, locales=["en-US"], enhanced_mode=EnhancedModeProperties( - prompt=["This is a technical discussion about Azure services"], - task="transcribe" - ) + prompt=["This is a technical discussion about Azure services"], task="transcribe" + ), ) - + result = client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_file.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_file.py index d5065bacdfd9..3974dfdf95f8 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_file.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_file.py @@ -18,23 +18,20 @@ class TestTranscriptionFile(TranscriptionClientTestBase): def test_transcribe_wav_file(self, transcription_endpoint): """Test transcription from a local WAV file.""" client = self.create_client(endpoint=transcription_endpoint) - + # Path to test audio file test_audio_path = os.path.join(os.path.dirname(__file__), "assets", 
"audio.wav") - + # Skip test if audio file doesn't exist if not os.path.exists(test_audio_path): pytest.skip(f"Test audio file not found: {test_audio_path}") - + with open(test_audio_path, "rb") as audio_file: # Create transcription content with audio file and options - content = TranscriptionContent( - definition=TranscriptionOptions(locales=["en-US"]), - audio=audio_file - ) - + content = TranscriptionContent(definition=TranscriptionOptions(locales=["en-US"]), audio=audio_file) + result = client.transcribe(body=content) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_file_async.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_file_async.py index 259bde4f9d24..cb31674c9f24 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_file_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_file_async.py @@ -18,24 +18,21 @@ class TestTranscriptionFileAsync(TranscriptionClientTestBase): async def test_transcribe_wav_file_async(self, transcription_endpoint): """Test async transcription with a local WAV audio file.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: # Path to test audio file test_audio_path = os.path.join(os.path.dirname(__file__), "assets", "audio.wav") - + # Skip test if audio file doesn't exist (for initial setup) if not os.path.exists(test_audio_path): pytest.skip(f"Test audio file not found: {test_audio_path}") - + with open(test_audio_path, "rb") as audio_file: # Create transcription content with audio file and options - content = TranscriptionContent( - definition=TranscriptionOptions(locales=["en-US"]), - audio=audio_file - ) - + content = TranscriptionContent(definition=TranscriptionOptions(locales=["en-US"]), audio=audio_file) + result = await client.transcribe(body=content) - 
+ assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_options.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_options.py index 42fba23dda07..4eabfeefe920 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_options.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_options.py @@ -17,16 +17,12 @@ class TestTranscriptionOptions(TranscriptionClientTestBase): def test_transcribe_profanity_filter_raw(self, transcription_endpoint, transcription_test_audio_url): """Test transcription with no profanity filtering.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url - options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US"], - profanity_filter_mode="None" - ) - + options = TranscriptionOptions(audio_url=audio_url, locales=["en-US"], profanity_filter_mode="None") + result = client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 @@ -36,16 +32,12 @@ def test_transcribe_profanity_filter_raw(self, transcription_endpoint, transcrip def test_transcribe_profanity_filter_masked(self, transcription_endpoint, transcription_test_audio_url): """Test transcription with profanity masking.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url - options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US"], - profanity_filter_mode="Masked" - ) - + options = TranscriptionOptions(audio_url=audio_url, locales=["en-US"], profanity_filter_mode="Masked") + result = client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert 
result.combined_phrases[0].text is not None @@ -55,16 +47,12 @@ def test_transcribe_profanity_filter_masked(self, transcription_endpoint, transc def test_transcribe_profanity_filter_removed(self, transcription_endpoint, transcription_test_audio_url): """Test transcription with profanity filter set to Removed.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url - options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US"], - profanity_filter_mode="Removed" - ) - + options = TranscriptionOptions(audio_url=audio_url, locales=["en-US"], profanity_filter_mode="Removed") + result = client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None @@ -73,21 +61,18 @@ def test_transcribe_profanity_filter_removed(self, transcription_endpoint, trans def test_transcribe_with_phrase_list(self, transcription_endpoint, transcription_test_audio_url): """Test transcription with a custom phrase list for better recognition.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url - + # Add custom phrases for better recognition options = TranscriptionOptions( audio_url=audio_url, locales=["en-US"], - phrase_list=PhraseListProperties( - phrases=["Azure", "Cognitive Services", "Speech SDK"], - biasing_weight=1.0 - ) + phrase_list=PhraseListProperties(phrases=["Azure", "Cognitive Services", "Speech SDK"], biasing_weight=1.0), ) - + result = client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 @@ -98,17 +83,14 @@ def test_transcribe_with_phrase_list(self, transcription_endpoint, transcription def test_transcribe_multiple_locales(self, transcription_endpoint, transcription_test_audio_url): """Test transcription with multiple locales specified.""" client = 
self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url - + # Specify multiple locales for auto-detection - options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US", "es-ES", "fr-FR"] - ) - + options = TranscriptionOptions(audio_url=audio_url, locales=["en-US", "es-ES", "fr-FR"]) + result = client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_options_async.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_options_async.py index e15d5691149a..3b83268da4b1 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_options_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_options_async.py @@ -17,18 +17,14 @@ class TestTranscriptionOptionsAsync(TranscriptionClientTestBase): async def test_transcribe_profanity_filter_masked_async(self, transcription_endpoint, transcription_test_audio_url): """Test async transcription with masked profanity filter.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: audio_url = transcription_test_audio_url - - options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US"], - profanity_filter_mode="Masked" - ) - + + options = TranscriptionOptions(audio_url=audio_url, locales=["en-US"], profanity_filter_mode="Masked") + result = await client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 @@ -38,22 +34,21 @@ async def test_transcribe_profanity_filter_masked_async(self, transcription_endp async def test_transcribe_with_phrase_list_async(self, transcription_endpoint, transcription_test_audio_url): """Test async transcription with custom phrase 
list.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: audio_url = transcription_test_audio_url - + # Add custom phrases for better recognition options = TranscriptionOptions( audio_url=audio_url, locales=["en-US"], phrase_list=PhraseListProperties( - phrases=["Azure", "Cognitive Services", "Speech SDK"], - biasing_weight=1.0 - ) + phrases=["Azure", "Cognitive Services", "Speech SDK"], biasing_weight=1.0 + ), ) - + result = await client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 @@ -64,19 +59,16 @@ async def test_transcribe_with_phrase_list_async(self, transcription_endpoint, t async def test_transcribe_multiple_locales_async(self, transcription_endpoint, transcription_test_audio_url): """Test async transcription with multiple language locales.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: # For multi-locale, ideally use multilingual audio, but single language works for testing audio_url = transcription_test_audio_url - + # Specify multiple locales for auto-detection - options = TranscriptionOptions( - audio_url=audio_url, - locales=["en-US", "es-ES", "fr-FR"] - ) - + options = TranscriptionOptions(audio_url=audio_url, locales=["en-US", "es-ES", "fr-FR"]) + result = await client.transcribe_from_url(audio_url, options=options) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_url.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_url.py index 971767fc39be..bccd243bb905 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_url.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_url.py @@ -16,10 +16,10 @@ class 
TestTranscriptionUrl(TranscriptionClientTestBase): def test_transcribe_from_public_url(self, transcription_endpoint, transcription_test_audio_url): """Test transcription from a publicly accessible URL.""" client = self.create_client(endpoint=transcription_endpoint) - + audio_url = transcription_test_audio_url result = client.transcribe_from_url(audio_url) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 diff --git a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_url_async.py b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_url_async.py index d46e53b47aee..ecdda3468b2b 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_url_async.py +++ b/sdk/cognitiveservices/azure-ai-transcription/tests/test_transcription_url_async.py @@ -16,12 +16,12 @@ class TestTranscriptionUrlAsync(TranscriptionClientTestBase): async def test_transcribe_from_public_url_async(self, transcription_endpoint, transcription_test_audio_url): """Test async transcription from a public URL.""" client = self.create_async_client(endpoint=transcription_endpoint) - + async with client: audio_url = transcription_test_audio_url - + result = await client.transcribe_from_url(audio_url) - + assert result is not None assert result.combined_phrases is not None assert len(result.combined_phrases) > 0 diff --git a/sdk/cognitiveservices/azure-ai-transcription/tsp-location.yaml b/sdk/cognitiveservices/azure-ai-transcription/tsp-location.yaml index 845205d658bc..bb0f9e351615 100644 --- a/sdk/cognitiveservices/azure-ai-transcription/tsp-location.yaml +++ b/sdk/cognitiveservices/azure-ai-transcription/tsp-location.yaml @@ -1,4 +1,4 @@ directory: specification/cognitiveservices/Speech.Transcription -commit: 5f88489f000c7471e9517b5f1f27f2fb61aacc50 +commit: 6bd84f27b7a056fc6e916e2e9fefa9fdba1d72d2 repo: Azure/azure-rest-api-specs additionalDirectories: