Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@ workspace/
!workspace/AGENT.md
!workspace/skills/skill-creator
*.private*
workspace/conversations/*
16 changes: 16 additions & 0 deletions base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package ai.javaclaw.speech;

import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;

import java.io.InputStream;

@Service
@ConditionalOnProperty(name = "speech.provider", havingValue = "mock", matchIfMissing = true)
public class MockSpeechToTextService implements SpeechToTextService {

@Override
public String transcribe(InputStream audioStream) {
return "[voice message]";
}
}
111 changes: 111 additions & 0 deletions base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package ai.javaclaw.speech;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.UUID;

@Service
@ConditionalOnProperty(name = "speech.provider", havingValue = "openai")
public class OpenAiSpeechToTextService implements SpeechToTextService {

private static final Logger LOGGER = LoggerFactory.getLogger(OpenAiSpeechToTextService.class);
private static final JsonFactory JSON_FACTORY = new JsonFactory();

private final HttpClient httpClient;
private final String apiKey;
private final String model;
private final String baseUrl;

public OpenAiSpeechToTextService(
@Value("${spring.ai.openai.api-key}") String apiKey,
@Value("${speech.openai.model:whisper-1}") String model,
@Value("${speech.openai.base-url:https://api.openai.com/v1}") String baseUrl) {
this.apiKey = apiKey;
this.model = model;
this.baseUrl = baseUrl;
this.httpClient = HttpClient.newHttpClient();
}

@Override
public String transcribe(InputStream audioStream) {
LOGGER.info("Transcribing audio via OpenAI Whisper (model: {})", model);

try {
byte[] audioBytes = audioStream.readAllBytes();
String boundary = UUID.randomUUID().toString();
byte[] body = buildMultipartBody(boundary, audioBytes);

HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(baseUrl + "/audio/transcriptions"))
.header("Authorization", "Bearer " + apiKey)
.header("Content-Type", "multipart/form-data; boundary=" + boundary)
.POST(HttpRequest.BodyPublishers.ofByteArray(body))
.build();

HttpResponse<String> response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());

if (response.statusCode() != 200) {
throw new SpeechToTextException("OpenAI API returned status " + response.statusCode() + ": " + response.body());
}

String text = extractTextField(response.body());
if (text == null || text.isBlank()) {
throw new SpeechToTextException("OpenAI returned empty transcription");
}

LOGGER.info("OpenAI transcription completed successfully");
return text.trim();

} catch (IOException | InterruptedException e) {
if (e instanceof InterruptedException) {
Thread.currentThread().interrupt();
}
throw new SpeechToTextException("Failed to call OpenAI transcription API", e);
}
}

private byte[] buildMultipartBody(String boundary, byte[] audioBytes) {
String crlf = "\r\n";

byte[] header = ("--" + boundary + crlf
+ "Content-Disposition: form-data; name=\"file\"; filename=\"voice.ogg\"" + crlf
+ "Content-Type: audio/ogg" + crlf + crlf).getBytes();

byte[] footer = (crlf + "--" + boundary + crlf
+ "Content-Disposition: form-data; name=\"model\"" + crlf + crlf
+ model + crlf
+ "--" + boundary + "--" + crlf).getBytes();

byte[] body = new byte[header.length + audioBytes.length + footer.length];
System.arraycopy(header, 0, body, 0, header.length);
System.arraycopy(audioBytes, 0, body, header.length, audioBytes.length);
System.arraycopy(footer, 0, body, header.length + audioBytes.length, footer.length);

return body;
}

private String extractTextField(String json) throws IOException {
try (JsonParser parser = JSON_FACTORY.createParser(json)) {
while (parser.nextToken() != null) {
if (parser.currentToken() == JsonToken.FIELD_NAME && "text".equals(parser.currentName())) {
parser.nextToken();
return parser.getValueAsString();
}
}
}
return null;
}
}
12 changes: 12 additions & 0 deletions base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package ai.javaclaw.speech;

public class SpeechToTextException extends RuntimeException {

public SpeechToTextException(String message) {
super(message);
}

public SpeechToTextException(String message, Throwable cause) {
super(message, cause);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package ai.javaclaw.speech;

import java.io.InputStream;

public interface SpeechToTextService {

String transcribe(InputStream audioStream);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package ai.javaclaw.speech;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;

@Service
@ConditionalOnProperty(name = "speech.provider", havingValue = "whisper-cpp")
public class WhisperCppSpeechToTextService implements SpeechToTextService {

private static final Logger LOGGER = LoggerFactory.getLogger(WhisperCppSpeechToTextService.class);

private final String modelPath;

public WhisperCppSpeechToTextService(
@Value("${speech.whisper-cpp.model-path}") String modelPath) {
this.modelPath = modelPath;
}

@Override
public String transcribe(InputStream audioStream) {
LOGGER.info("Transcribing audio via whisper-cpp (model: {})", modelPath);

Path oggFile = null;
Path wavFile = null;
Path outputFile = null;

try {
oggFile = Files.createTempFile("whisper-input-", ".ogg");
Files.write(oggFile, audioStream.readAllBytes());

wavFile = Files.createTempFile("whisper-input-", ".wav");
convertOggToWav(oggFile, wavFile);

outputFile = Files.createTempFile("whisper-output-", ".txt");
Files.deleteIfExists(outputFile);

ProcessBuilder pb = new ProcessBuilder(
"whisper-cli",
"-m", modelPath,
"-f", wavFile.toString(),
"-otxt",
"-of", outputFile.toString().replace(".txt", ""),
"--no-prints"
);
pb.redirectErrorStream(true);

Process process = pb.start();
boolean finished = process.waitFor(60, TimeUnit.SECONDS);

if (!finished) {
process.destroyForcibly();
throw new SpeechToTextException("whisper-cli timed out after 60 seconds");
}

if (process.exitValue() != 0) {
String error = new String(process.getInputStream().readAllBytes());
throw new SpeechToTextException("whisper-cli exited with code " + process.exitValue() + ": " + error);
}

if (!Files.exists(outputFile)) {
throw new SpeechToTextException("whisper-cli did not produce output file");
}

String text = Files.readString(outputFile).trim();
if (text.isBlank()) {
throw new SpeechToTextException("whisper-cli returned empty transcription");
}

LOGGER.info("whisper-cpp transcription completed successfully");
return text;

} catch (IOException | InterruptedException e) {
if (e instanceof InterruptedException) {
Thread.currentThread().interrupt();
}
throw new SpeechToTextException("Failed to run whisper-cli", e);
} finally {
deleteSilently(oggFile);
deleteSilently(wavFile);
deleteSilently(outputFile);
}
}

private void convertOggToWav(Path oggFile, Path wavFile) throws IOException, InterruptedException {
ProcessBuilder pb = new ProcessBuilder(
"ffmpeg", "-y", "-i", oggFile.toString(), "-ar", "16000", "-ac", "1", wavFile.toString()
);
pb.redirectErrorStream(true);

Process process = pb.start();
boolean finished = process.waitFor(30, TimeUnit.SECONDS);

if (!finished) {
process.destroyForcibly();
throw new SpeechToTextException("ffmpeg conversion timed out");
}

if (process.exitValue() != 0) {
String error = new String(process.getInputStream().readAllBytes());
throw new SpeechToTextException("ffmpeg conversion failed: " + error);
}
}

private void deleteSilently(Path path) {
if (path != null) {
try {
Files.deleteIfExists(path);
} catch (IOException ignored) {
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
import ai.javaclaw.channels.Channel;
import ai.javaclaw.channels.ChannelMessageReceivedEvent;
import ai.javaclaw.channels.ChannelRegistry;
import ai.javaclaw.speech.SpeechToTextService;
import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.html.HtmlRenderer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient;
Expand All @@ -17,11 +21,11 @@
import org.telegram.telegrambots.meta.api.objects.message.Message;
import org.telegram.telegrambots.meta.exceptions.TelegramApiException;
import org.telegram.telegrambots.meta.generics.TelegramClient;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.html.HtmlRenderer;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Optional;

import static java.util.Optional.ofNullable;

Expand All @@ -40,18 +44,26 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli
private final TelegramClient telegramClient;
private final Agent agent;
private final ChannelRegistry channelRegistry;
private final SpeechToTextService speechToTextService;
private final TelegramVoiceDownloader voiceDownloader;
private Long chatId;

public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) {
this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry);
public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService);
}

TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
this(botToken, allowedUsername, telegramClient, agent, channelRegistry, speechToTextService, new TelegramVoiceDownloader(telegramClient, botToken));
}

TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) {
TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService, TelegramVoiceDownloader voiceDownloader) {
this.botToken = botToken;
this.allowedUsername = normalizeUsername(allowedUsername);
this.telegramClient = telegramClient;
this.agent = agent;
this.channelRegistry = channelRegistry;
this.speechToTextService = speechToTextService;
this.voiceDownloader = voiceDownloader;
channelRegistry.registerChannel(this);
LOGGER.info("Started Telegram integration");
}
Expand All @@ -68,7 +80,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() {

@Override
public void consume(Update update) {
if (!(update.hasMessage() && update.getMessage().hasText())) return;
if (!update.hasMessage()) return;

Message requestMessage = update.getMessage();
String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName();
Expand All @@ -78,11 +90,13 @@ public void consume(Update update) {
return;
}

String messageText = requestMessage.getText();
Optional<String> messageText = resolveMessageText(requestMessage);
if (messageText.isEmpty()) return;

this.chatId = requestMessage.getChatId();
Integer messageThreadId = requestMessage.getMessageThreadId();
channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId));
String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText);
channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText.get(), chatId, messageThreadId));
String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText.get());
sendMessage(chatId, messageThreadId, response);
}

Expand Down Expand Up @@ -124,6 +138,28 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) {
}
}

private Optional<String> resolveMessageText(Message message) {
if (message.hasText()) {
return Optional.of(message.getText());
}
if (message.hasVoice()) {
return transcribeVoice(message);
}
return Optional.empty();
}

private Optional<String> transcribeVoice(Message message) {
LOGGER.info("Voice message received, downloading audio");
try (InputStream voiceStream = voiceDownloader.download(message)) {
String transcribed = speechToTextService.transcribe(voiceStream);
LOGGER.info("Voice message transcribed successfully");
return Optional.of(transcribed);
} catch (IOException | TelegramApiException e) {
LOGGER.error("Failed to process voice message", e);
return Optional.empty();
}
}

private String convertMarkdownToTelegramHtml(String markdown) {
if (markdown == null || markdown.isBlank()) return "";

Expand Down Expand Up @@ -181,4 +217,4 @@ public Integer getMessageThreadId() {
return messageThreadId;
}
}
}
}
Loading