-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbatch_translate.py
More file actions
123 lines (104 loc) · 4.72 KB
/
batch_translate.py
File metadata and controls
123 lines (104 loc) · 4.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import time
import logging
from tqdm import tqdm
from itertools import islice
from client_utils import get_client, get_response
from tqdm.contrib.logging import logging_redirect_tqdm
# Root logger setup: show INFO-and-above records, each one prefixed with
# its timestamp and severity level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
def chunks(iterable, n):
    """
    Lazily yield consecutive lists of at most n items taken from an iterable.
    Used to divide subtitles into batches; the last list may be shorter.
    """
    source = iter(iterable)
    # Two-argument iter(): call the lambda repeatedly and stop as soon as it
    # returns an empty list, i.e. once the source has been exhausted.
    yield from iter(lambda: list(islice(source, n)), [])
def build_batch_prompt(subs, source_lang, target_lang):
    """
    Assemble the translation prompt for one batch of subtitles.
    The prompt states the rules (line-by-line translation, same order and
    count, no preamble), lists every subtitle as a numbered line starting at
    1, and ends with a header for the target-language output.
    """
    header = (
        f"Translate the following {source_lang} subtitles to {target_lang}.\n"
        "Rules:\n- Translate each line individually.\n"
        f"- Return {target_lang} lines in the same order and count.\n"
        "- No preamble.\n\n"
        f"{source_lang} subtitles:\n"
    )
    numbered = []
    for position, entry in enumerate(subs, start=1):
        # Flatten in-subtitle line breaks (the literal "\N" marker left by
        # the subtitle parser) so each subtitle occupies exactly one line.
        flattened = entry.text.replace("\\N", " ").strip()
        numbered.append(f"{position}. {flattened}\n")
    footer = f"\n{target_lang} subtitles:\n"
    return header + "".join(numbered) + footer
def extract_lines_from_response(response):
    """
    Parse an LLM response into a list of translated lines.

    The response is split on newlines. Blank lines are skipped, and a leading
    numeric index such as "1. " is removed from each remaining line. Text
    before the first "." is treated as an index only when it consists solely
    of digits and the dot is followed by whitespace or ends the line, so
    sentences like "Hello. How are you?" and numbers like "3.5" are kept whole.

    Args:
        response: raw text returned by the model
    Returns:
        A list of stripped strings, one per non-blank line of the response
        (an empty list when the response is empty or whitespace only).
    """
    results = []
    for raw_line in response.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            # Blank separator lines carry no translation; counting them
            # would break the 1:1 correspondence with the input lines.
            continue
        head, dot, rest = line.partition(".")
        is_index = dot and head.isdecimal() and (not rest or rest[0].isspace())
        if is_index:
            # Drop the index (e.g. "1. سلام!" -> "سلام!"); a bare "2." yields
            # an empty string so the line still counts as one entry.
            results.append(rest.strip())
        else:
            results.append(line)
    return results
def batch_translate(subtitles: list, batch_size: int, source_lang: str, target_lang: str, model_name: str):
    """
    Translate subtitles in place, batch by batch, using a specified LLM model.

    Only subtitles whose `.text` is non-blank are sent to the model; blank
    entries are left untouched. A batch that fails (an error from the API
    call, or a response with a different number of lines than the batch) is
    logged and left untranslated, and processing continues with the next batch.

    Args:
        subtitles: list of subtitle objects (each with a `.text` attribute)
        batch_size: number of subtitles per translation batch (must be >= 1)
        source_lang: source language name (e.g., "English")
        target_lang: target language name (e.g., "Persian")
        model_name: name of the LLM model to use (e.g., "gemini-2.5-flash")
    Returns:
        The same `subtitles` list, with `.text` replaced by the translation
        for every successfully translated entry. An empty list is returned
        when `subtitles` is empty.
    Raises:
        Exception: if `model_name` is None.
        ValueError: if `batch_size` is less than 1.
    """
    # Validate model selection
    if model_name is None:
        raise Exception("Specify model name with -m parameter.")
    if not subtitles:
        return []
    if batch_size < 1:
        # Without this check the batch-count computation below would fail
        # with an unhelpful ZeroDivisionError (or produce a negative total).
        raise ValueError("batch_size must be a positive integer.")

    # Blank subtitles have nothing to translate; sending them would waste
    # prompt lines and invite line-count mismatches in the response.
    dialogue_lines = [s for s in subtitles if s.text.strip()]
    if not dialogue_lines:
        return subtitles
    total_batches = (len(dialogue_lines) + batch_size - 1) // batch_size  # ceiling division

    # Get API client instance from utility
    client = get_client(model_name)

    # Constants for Gemini rate limits
    GEMINI_RPM_LIMIT = 15  # Gemini's rate limit: requests per minute
    WAIT_TIME_SECONDS = 30  # wait time after hitting rate limit

    # Captures logs and renders them above the progress bar,
    # avoiding the "progress bar on every line" problem
    with logging_redirect_tqdm():
        # Iterate over the same filtered list that total_batches was computed
        # from, so the progress bar total matches the work actually done.
        batches = chunks(dialogue_lines, batch_size)
        for i, batch in enumerate(tqdm(batches, total=total_batches, desc="Translating")):
            prompt = build_batch_prompt(batch, source_lang, target_lang)
            try:
                # Send prompt to LLM model
                response_text = get_response(client, model_name, prompt)
                # Extract the translated lines
                translated_lines = extract_lines_from_response(response_text)
                # Ensure response has the same number of lines
                if len(translated_lines) != len(batch):
                    logging.error("Mismatch in lines. Batch index: %d", i)
                    logging.error("Original: %s", [s.text for s in batch])
                    logging.error("Translated: %s", translated_lines)
                    raise ValueError("Mismatch in number of lines returned from LLM.")
                # Assign translated text back to subtitle objects
                for sub, new_text in zip(batch, translated_lines):
                    sub.text = new_text
            except Exception as e:
                # Best effort: log the failure and keep the batch untranslated
                # rather than aborting the whole run.
                logging.exception("Error translating batch %d: %s", i, e)

            # Respect LLM API's RPM limit: pause after every GEMINI_RPM_LIMIT
            # requests actually sent (i is zero-based, hence i + 1), and skip
            # the pause when no further request will follow.
            requests_sent = i + 1
            if requests_sent % GEMINI_RPM_LIMIT == 0 and requests_sent < total_batches:
                logging.info("Waiting %d seconds for rate limits...", WAIT_TIME_SECONDS)
                time.sleep(WAIT_TIME_SECONDS)
    return subtitles