diff --git a/prompts/Gemini_timestamped_transcription_generation_prompt.md b/prompts/Gemini_timestamped_transcription_generation_prompt.md index 1123a08..2e2d8a9 100644 --- a/prompts/Gemini_timestamped_transcription_generation_prompt.md +++ b/prompts/Gemini_timestamped_transcription_generation_prompt.md @@ -1,43 +1,63 @@ -You are a highly accurate audio transcription assistant. Your task is to transcribe audio content in multiple languages for research purposes. You will transcribe audio from an MP3 file and provide a transcript with precise timestamps for each spoken phrase or sentence. - -### Output Example - -[00:00] Hello, how are you? [background music] -[00:05] I'm fine, thank you. [child laughing] - -### Instructions - -1. **Transcription Accuracy** - - - **Dialects & Accents**: Pay close attention to nuances in dialects and accents. Accurately capture regional accents and colloquialisms. - - **Clarity**: Ensure that all spoken words are transcribed clearly, without omissions or additions. - - **Language Mix**: Transcribe all spoken words as heard, even if multiple languages are used. - -2. **Timestamp Granularity** - - - **Phrase/Sentence Start**: Provide timestamps at the beginning of each new phrase or sentence. - - **Long Sentences**: If a sentence is very long, insert timestamps at natural pauses within the sentence, ensuring that no segment exceeds 15 seconds without a timestamp. - -3. **Cultural Sensitivity** - - - **Contextual Understanding**: Be mindful of cultural nuances and contexts that may influence the spoken content. - - **Respectful Representation**: Ensure that the transcription respectfully and accurately represents the speakers' intentions and meanings. - -4. **Output Format** - - - **Consistency**: Follow the exact structure demonstrated in the output example. - - **Timestamps**: Use the `[MM:SS]` format for timestamps at the beginning of each entry, indicating minutes and seconds. 
- - **Non-Speech Elements**: Use placeholders like `[inaudible]`, `[unclear]`, `[noise]`, `[music]`, etc., to indicate non-speech sounds or unclear speech. - -5. **Quality Assurance** - - - **Review and Proofreading**: Thoroughly review the transcription for accuracy, completeness, and grammatical correctness before finalizing. - - **Accuracy Check**: Verify that the transcription faithfully represents the audio content, including any mixed languages or dialects. - -6. **Additional Guidelines** - - - **Handling Unclear Audio**: If certain parts of the audio are unclear or inaudible, indicate this in the transcription using appropriate placeholders. - ---- - -Please proceed to transcribe the provided audio file following these instructions. It is essential that the transcription is comprehensive, capturing every word and nuance accurately. +You are a highly accurate audio transcription assistant. Your task is to transcribe multiple audio segments provided in a single request. + +## Your Task + +You will receive multiple audio segments (numbered 1 through N). Transcribe each segment accurately and return the transcripts in order. + +## Output Requirements + +Return a JSON object with a `segments` array containing: +- **segment_number**: The segment number (1-indexed, matching the input order) +- **transcript**: The transcribed text for that segment + +## Transcription Guidelines + +1. **Accuracy** + - Capture every spoken word accurately, without omissions or additions + - Preserve the original language(s) as spoken, even if multiple languages are mixed + - Include filler words and false starts when clearly audible + +2. 
**Non-Speech Elements** + - Use `[inaudible]` for unclear or unintelligible speech + - Use `[unclear]` when speech is partially audible but uncertain + - Use `[music]` for music sections without speech + - Use `[silence]` for segments with no audio content + - Use `[noise]` for background noise that obscures speech + - Use descriptive annotations like `[background music]`, `[applause]`, `[laughter]`, `[coughing]` for sounds occurring alongside speech + +3. **Inline Annotations** + - Non-speech sounds occurring DURING speech should be noted inline + - Example: "Hello, how are you? [background music] I'm doing great today." + +4. **Quality** + - Do not skip any segments - transcribe all provided segments + - Maintain the exact segment numbering from input + - Each transcript should be complete for its segment + - Review for accuracy before finalizing + +## Example Output + +```json +{ + "segments": [ + { + "segment_number": 1, + "transcript": "Good morning everyone, welcome to today's broadcast. [background music]" + }, + { + "segment_number": 2, + "transcript": "We have an exciting show lined up for you today. [applause]" + }, + { + "segment_number": 3, + "transcript": "[music]" + }, + { + "segment_number": 4, + "transcript": "[inaudible] ...and that's why we need to [unclear] the policy." + } + ] +} +``` + +Remember: Transcribe ALL segments provided, maintaining their order and numbering. 
diff --git a/prompts/Gemini_timestamped_transcription_output_schema.json b/prompts/Gemini_timestamped_transcription_output_schema.json new file mode 100644 index 0000000..296100e --- /dev/null +++ b/prompts/Gemini_timestamped_transcription_output_schema.json @@ -0,0 +1,24 @@ +{ + "type": "object", + "required": ["segments"], + "properties": { + "segments": { + "type": "array", + "description": "Array of transcribed segments in order", + "items": { + "type": "object", + "required": ["segment_number", "transcript"], + "properties": { + "segment_number": { + "type": "integer", + "description": "The segment number (1-indexed, matching the order provided)" + }, + "transcript": { + "type": "string", + "description": "The transcript for this segment." + } + } + } + } + } +} diff --git a/prompts/Gemini_timestamped_transcription_system_instruction.md b/prompts/Gemini_timestamped_transcription_system_instruction.md new file mode 100644 index 0000000..04560e8 --- /dev/null +++ b/prompts/Gemini_timestamped_transcription_system_instruction.md @@ -0,0 +1,25 @@ +You are a specialized language model designed to transcribe audio content in multiple languages with high accuracy. Your primary task is to process segmented audio files and provide precise transcriptions. + +## Core Responsibilities + +1. **Accurate Transcription**: Transcribe each audio segment independently and accurately, capturing every spoken word without omissions or additions. + +2. **Multi-language Support**: Handle audio content in multiple languages, including mixed-language content within the same segment. + +3. **Dialect and Accent Recognition**: Pay close attention to regional dialects, accents, and colloquialisms to ensure accurate representation. + +4. **Cultural Sensitivity**: Maintain respectful and accurate representation of speakers' intentions and cultural contexts. 
+ +## Technical Requirements + +- Process each audio segment independently without considering content from other segments +- Maintain strict accuracy in transcription +- Check for grammatical correctness while preserving the actual spoken content +- Handle unclear audio appropriately with standardized placeholders + +## Output Standards + +- Provide structured JSON output following the specified schema +- Ensure segment numbers match the corresponding audio segments +- Include appropriate placeholders for non-speech elements ([inaudible], [unclear], [noise], [music], etc.) +- Maintain consistency across all segments \ No newline at end of file diff --git a/src/processing_pipeline/stage_1.py b/src/processing_pipeline/stage_1.py index 2214c8f..3744291 100644 --- a/src/processing_pipeline/stage_1.py +++ b/src/processing_pipeline/stage_1.py @@ -4,18 +4,20 @@ import json import boto3 import uuid +import pathlib from google import genai from google.genai.types import ( - File, - FileState, FinishReason, GenerateContentConfig, ThinkingConfig, + Part, ) from openai import OpenAI from prefect.flows import Flow from prefect.client.schemas import FlowRun, State from prefect.task_runners import ConcurrentTaskRunner +from prefect.tasks import exponential_backoff +from pydub import AudioSegment from processing_pipeline.timestamped_transcription_generator import TimestampedTranscriptionGenerator from processing_pipeline.supabase_utils import SupabaseClient from processing_pipeline.constants import ( @@ -145,7 +147,9 @@ def transcribe_audio_file_with_timestamp_with_gemini( audio_file=audio_file, gemini_key=gemini_key, model_name=model_name, - user_prompt=prompt_version["user_prompt"], + prompt_version=prompt_version, + segment_length=20, + batch_size=30, ) return {"timestamped_transcription": timestamped_transcription} @@ -607,53 +611,140 @@ def run( audio_file: str, gemini_key: str, model_name: GeminiModel, - user_prompt: str, - ): + prompt_version: dict, + segment_length: int = 
20, + batch_size: int = 30, + ) -> str: if not gemini_key: raise ValueError("Google Gemini API key was not set!") client = genai.Client(api_key=gemini_key) - # Upload the audio file and wait for it to finish processing - uploaded_audio_file = client.files.upload(file=audio_file, config={"mime_type": "audio/mp3"}) - while uploaded_audio_file.state == FileState.PROCESSING: - print("Processing the uploaded audio file...") - time.sleep(1) - uploaded_audio_file = client.files.get(name=uploaded_audio_file.name) + # Split audio into segments + segment_paths = cls.split_audio_into_segments(audio_file, segment_length * 1000) + total_segments = len(segment_paths) + print(f"Split audio into {total_segments} segments of {segment_length}s each") + + all_transcripts = {} # segment_number -> transcript try: - return cls.transcribe(client, uploaded_audio_file, model_name, user_prompt) + for batch_start in range(0, total_segments, batch_size): + batch_end = min(batch_start + batch_size, total_segments) + batch_paths = segment_paths[batch_start:batch_end] + + print(f"Processing batch: segments {batch_start + 1}-{batch_end} of {total_segments}") + + result = cls.transcribe_batch( + client, + batch_paths, + model_name, + prompt_version, + ) + + # Validate segment count + returned_segments = result.get("segments", []) + expected_count = len(batch_paths) + actual_count = len(returned_segments) + if actual_count != expected_count: + raise ValueError( + f"Segment count mismatch: expected {expected_count} segments, " + f"got {actual_count} (batch_start={batch_start})" + ) + + for segment in returned_segments: + segment_num = segment["segment_number"] + if segment_num < 1 or segment_num > expected_count: + raise ValueError( + f"Invalid segment_number {segment_num}: expected range 1-{expected_count} " + f"(batch_start={batch_start})" + ) + absolute_segment_num = batch_start + segment_num + all_transcripts[absolute_segment_num] = segment["transcript"] + + print(f"Batch complete: transcribed 
{actual_count} segments") + finally: - client.files.delete(name=uploaded_audio_file.name) + for segment_path in segment_paths: + if os.path.exists(segment_path): + os.remove(segment_path) + + return cls.format_final_transcription(all_transcripts, segment_length) - @optional_task(log_prints=True, retries=3) @classmethod - def transcribe( + @optional_task(log_prints=True, retries=3, retry_delay_seconds=exponential_backoff(backoff_factor=2)) + def transcribe_batch( cls, client: genai.Client, - uploaded_audio_file: File, + segment_paths: list, model_name: GeminiModel, - user_prompt: str, + prompt_version: dict, ): + segments = [] + for i, segment_path in enumerate(segment_paths): + segment_num = i + 1 + segments.extend( + [ + f"<segment_{segment_num}>\n\n", + Part.from_bytes(data=pathlib.Path(segment_path).read_bytes(), mime_type="audio/mp3"), + f"\n</segment_{segment_num}>\n\n", + ] + ) + thinking_budget = 128 if model_name == GeminiModel.GEMINI_2_5_PRO else 0 result = client.models.generate_content( model=model_name, - contents=[user_prompt, uploaded_audio_file], + contents=[prompt_version["user_prompt"]] + segments, config=GenerateContentConfig( + response_mime_type="application/json", + response_schema=prompt_version["output_schema"], + system_instruction=prompt_version["system_instruction"], max_output_tokens=16384, thinking_config=ThinkingConfig(thinking_budget=thinking_budget), safety_settings=get_safety_settings(), ), ) - if not result.text: + if not result.parsed: finish_reason = result.candidates[0].finish_reason if result.candidates else None - if finish_reason == FinishReason.MAX_TOKENS: raise ValueError("The response from Gemini was too long and was cut off.") + raise ValueError(f"No response from Gemini. 
Finish reason: {finish_reason}.") - print(f"Response finish reason: {finish_reason}") - raise ValueError("No response from Gemini.") + return result.parsed + + @classmethod + def format_final_transcription(cls, transcripts: dict, segment_length: int) -> str: + result = "" + + for segment_num in sorted(transcripts.keys()): + transcript = transcripts[segment_num] + + total_seconds = (segment_num - 1) * segment_length + minutes = total_seconds // 60 + seconds = total_seconds % 60 + + result += f"[{minutes:02}:{seconds:02}] {transcript}\n" + + return result + + @classmethod + def split_audio_into_segments(cls, audio_file: str, segment_length_ms: int) -> list: + audio = AudioSegment.from_mp3(audio_file) + segments = [] + + audio_length_ms = len(audio) + print(f"Audio duration: {audio_length_ms / 1000:.1f} seconds") + + for i in range(0, audio_length_ms, segment_length_ms): + # Slice the audio segment + subclip = audio[i : min(i + segment_length_ms, audio_length_ms)] + + # Export the subclip + output_file = f"{audio_file}_segment_{(i // segment_length_ms) + 1}.mp3" + subclip.export(output_file, format="mp3") + + segments.append(output_file) - return result.text + del audio + return segments diff --git a/src/scripts/import_prompts_to_db.py b/src/scripts/import_prompts_to_db.py index 0f43db9..349a635 100644 --- a/src/scripts/import_prompts_to_db.py +++ b/src/scripts/import_prompts_to_db.py @@ -39,7 +39,9 @@ "output_schema": "prompts/Stage_4_output_schema.json", }, "gemini_timestamped_transcription": { + "system_instruction": "prompts/Gemini_timestamped_transcription_system_instruction.md", "user_prompt": "prompts/Gemini_timestamped_transcription_generation_prompt.md", + "output_schema": "prompts/Gemini_timestamped_transcription_output_schema.json", }, }