diff --git a/.gitignore b/.gitignore index f6fec4a..82f6c08 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ workspace/ !workspace/AGENT.md !workspace/skills/skill-creator *.private* +workspace/conversations/* \ No newline at end of file diff --git a/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java new file mode 100644 index 0000000..8d94106 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java @@ -0,0 +1,16 @@ +package ai.javaclaw.speech; + +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import java.io.InputStream; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "mock", matchIfMissing = true) +public class MockSpeechToTextService implements SpeechToTextService { + + @Override + public String transcribe(InputStream audioStream) { + return "[voice message]"; + } +} diff --git a/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java new file mode 100644 index 0000000..cb438c0 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java @@ -0,0 +1,111 @@ +package ai.javaclaw.speech; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.util.UUID; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "openai") +public class OpenAiSpeechToTextService implements SpeechToTextService { + + private static final Logger LOGGER = LoggerFactory.getLogger(OpenAiSpeechToTextService.class); + private static final JsonFactory JSON_FACTORY = new JsonFactory(); + + private final HttpClient httpClient; + private final String apiKey; + private final String model; + private final String baseUrl; + + public OpenAiSpeechToTextService( + @Value("${spring.ai.openai.api-key}") String apiKey, + @Value("${speech.openai.model:whisper-1}") String model, + @Value("${speech.openai.base-url:https://api.openai.com/v1}") String baseUrl) { + this.apiKey = apiKey; + this.model = model; + this.baseUrl = baseUrl; + this.httpClient = HttpClient.newHttpClient(); + } + + @Override + public String transcribe(InputStream audioStream) { + LOGGER.info("Transcribing audio via OpenAI Whisper (model: {})", model); + + try { + byte[] audioBytes = audioStream.readAllBytes(); + String boundary = UUID.randomUUID().toString(); + byte[] body = buildMultipartBody(boundary, audioBytes); + + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(baseUrl + "/audio/transcriptions")) + .header("Authorization", "Bearer " + apiKey) + .header("Content-Type", "multipart/form-data; boundary=" + boundary) + .POST(HttpRequest.BodyPublishers.ofByteArray(body)) + .build(); + + HttpResponse response = httpClient.send(request, HttpResponse.BodyHandlers.ofString()); + + if (response.statusCode() != 200) { + throw new SpeechToTextException("OpenAI API returned status " + response.statusCode() + ": " + response.body()); + } + + String text = extractTextField(response.body()); + if (text == null || text.isBlank()) { + throw new SpeechToTextException("OpenAI returned empty transcription"); + } + + LOGGER.info("OpenAI transcription completed successfully"); + return text.trim(); + + } catch (IOException | InterruptedException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new SpeechToTextException("Failed to call OpenAI transcription API", e); + } + } + + private byte[] buildMultipartBody(String boundary, byte[] audioBytes) { + String crlf = "\r\n"; + + byte[] header = ("--" + boundary + crlf + + "Content-Disposition: form-data; name=\"file\"; filename=\"voice.ogg\"" + crlf + + "Content-Type: audio/ogg" + crlf + crlf).getBytes(); + + byte[] footer = (crlf + "--" + boundary + crlf + + "Content-Disposition: form-data; name=\"model\"" + crlf + crlf + + model + crlf + + "--" + boundary + "--" + crlf).getBytes(); + + byte[] body = new byte[header.length + audioBytes.length + footer.length]; + System.arraycopy(header, 0, body, 0, header.length); + System.arraycopy(audioBytes, 0, body, header.length, audioBytes.length); + System.arraycopy(footer, 0, body, header.length + audioBytes.length, footer.length); + + return body; + } + + private String extractTextField(String json) throws IOException { + try (JsonParser parser = JSON_FACTORY.createParser(json)) { + while (parser.nextToken() != null) { + if (parser.currentToken() == JsonToken.FIELD_NAME && "text".equals(parser.currentName())) { + parser.nextToken(); + return parser.getValueAsString(); + } + } + } + return null; + } +} diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java new file mode 100644 index 0000000..d5483a4 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java @@ -0,0 +1,12 @@ +package ai.javaclaw.speech; + +public class SpeechToTextException extends RuntimeException { + + public SpeechToTextException(String message) { + super(message); + } + + public SpeechToTextException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java new file mode 100644 index 0000000..ac9f860 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java @@ -0,0 +1,8 @@ +package ai.javaclaw.speech; + +import java.io.InputStream; + +public interface SpeechToTextService { + + String transcribe(InputStream audioStream); +} diff --git a/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java new file mode 100644 index 0000000..4b795dc --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java @@ -0,0 +1,121 @@ +package ai.javaclaw.speech; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.concurrent.TimeUnit; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "whisper-cpp") +public class WhisperCppSpeechToTextService implements SpeechToTextService { + + private static final Logger LOGGER = LoggerFactory.getLogger(WhisperCppSpeechToTextService.class); + + private final String modelPath; + + public WhisperCppSpeechToTextService( + @Value("${speech.whisper-cpp.model-path}") String modelPath) { + this.modelPath = modelPath; + } + + @Override + public String transcribe(InputStream audioStream) { + LOGGER.info("Transcribing audio via whisper-cpp (model: {})", modelPath); + + Path oggFile = null; + Path wavFile = null; + Path outputFile = null; + + try { + oggFile = Files.createTempFile("whisper-input-", ".ogg"); + Files.write(oggFile, audioStream.readAllBytes()); + + wavFile = Files.createTempFile("whisper-input-", ".wav"); + convertOggToWav(oggFile, wavFile); + + outputFile = Files.createTempFile("whisper-output-", ".txt"); + Files.deleteIfExists(outputFile); + + ProcessBuilder pb = new ProcessBuilder( + "whisper-cli", + "-m", modelPath, + "-f", wavFile.toString(), + "-otxt", + "-of", outputFile.toString().replace(".txt", ""), + "--no-prints" + ); + pb.redirectErrorStream(true); + + Process process = pb.start(); + boolean finished = process.waitFor(60, TimeUnit.SECONDS); + + if (!finished) { + process.destroyForcibly(); + throw new SpeechToTextException("whisper-cli timed out after 60 seconds"); + } + + if (process.exitValue() != 0) { + String error = new String(process.getInputStream().readAllBytes()); + throw new SpeechToTextException("whisper-cli exited with code " + process.exitValue() + ": " + error); + } + + if (!Files.exists(outputFile)) { + throw new SpeechToTextException("whisper-cli did not produce output file"); + } + + String text = Files.readString(outputFile).trim(); + if (text.isBlank()) { + throw new SpeechToTextException("whisper-cli returned empty transcription"); + } + + LOGGER.info("whisper-cpp transcription completed successfully"); + return text; + + } catch (IOException | InterruptedException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new SpeechToTextException("Failed to run whisper-cli", e); + } finally { + deleteSilently(oggFile); + deleteSilently(wavFile); + deleteSilently(outputFile); + } + } + + private void convertOggToWav(Path oggFile, Path wavFile) throws IOException, InterruptedException { + ProcessBuilder pb = new ProcessBuilder( + "ffmpeg", "-y", "-i", oggFile.toString(), "-ar", "16000", "-ac", "1", wavFile.toString() + ); + pb.redirectErrorStream(true); + + Process process = pb.start(); + boolean finished = process.waitFor(30, TimeUnit.SECONDS); + + if (!finished) { + process.destroyForcibly(); + throw new SpeechToTextException("ffmpeg conversion timed out"); + } + + if (process.exitValue() != 0) { + String error = new String(process.getInputStream().readAllBytes()); + throw new SpeechToTextException("ffmpeg conversion failed: " + error); + } + } + + private void deleteSilently(Path path) { + if (path != null) { + try { + Files.deleteIfExists(path); + } catch (IOException ignored) { + } + } + } +} diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java index 018c427..0579842 100644 --- a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java @@ -4,7 +4,11 @@ import ai.javaclaw.channels.Channel; import ai.javaclaw.channels.ChannelMessageReceivedEvent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension; +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient; @@ -17,11 +21,11 @@ import org.telegram.telegrambots.meta.api.objects.message.Message; import org.telegram.telegrambots.meta.exceptions.TelegramApiException; import org.telegram.telegrambots.meta.generics.TelegramClient; -import org.commonmark.node.Node; -import org.commonmark.parser.Parser; -import org.commonmark.renderer.html.HtmlRenderer; +import java.io.IOException; +import java.io.InputStream; import java.util.List; +import java.util.Optional; import static java.util.Optional.ofNullable; @@ -40,18 +44,26 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli private final TelegramClient telegramClient; private final Agent agent; private final ChannelRegistry channelRegistry; + private final SpeechToTextService speechToTextService; + private final TelegramVoiceDownloader voiceDownloader; private Long chatId; - public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) { - this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry); + public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) { + this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService); + } + + TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) { + this(botToken, allowedUsername, telegramClient, agent, channelRegistry, speechToTextService, new TelegramVoiceDownloader(telegramClient, botToken)); } - TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) { + TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService, TelegramVoiceDownloader voiceDownloader) { this.botToken = botToken; this.allowedUsername = normalizeUsername(allowedUsername); this.telegramClient = telegramClient; this.agent = agent; this.channelRegistry = channelRegistry; + this.speechToTextService = speechToTextService; + this.voiceDownloader = voiceDownloader; channelRegistry.registerChannel(this); LOGGER.info("Started Telegram integration"); } @@ -68,7 +80,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() { @Override public void consume(Update update) { - if (!(update.hasMessage() && update.getMessage().hasText())) return; + if (!update.hasMessage()) return; Message requestMessage = update.getMessage(); String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName(); @@ -78,11 +90,13 @@ public void consume(Update update) { return; } - String messageText = requestMessage.getText(); + Optional messageText = resolveMessageText(requestMessage); + if (messageText.isEmpty()) return; + this.chatId = requestMessage.getChatId(); Integer messageThreadId = requestMessage.getMessageThreadId(); - channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId)); - String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText); + channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText.get(), chatId, messageThreadId)); + String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText.get()); sendMessage(chatId, messageThreadId, response); } @@ -124,6 +138,28 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) { } } + private Optional resolveMessageText(Message message) { + if (message.hasText()) { + return Optional.of(message.getText()); + } + if (message.hasVoice()) { + return transcribeVoice(message); + } + return Optional.empty(); + } + + private Optional transcribeVoice(Message message) { + LOGGER.info("Voice message received, downloading audio"); + try (InputStream voiceStream = voiceDownloader.download(message)) { + String transcribed = speechToTextService.transcribe(voiceStream); + LOGGER.info("Voice message transcribed successfully"); + return Optional.of(transcribed); + } catch (IOException | TelegramApiException e) { + LOGGER.error("Failed to process voice message", e); + return Optional.empty(); + } + } + private String convertMarkdownToTelegramHtml(String markdown) { if (markdown == null || markdown.isBlank()) return ""; @@ -181,4 +217,4 @@ public Integer getMessageThreadId() { return messageThreadId; } } -} \ No newline at end of file +} diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java index 76bb8c4..a007c61 100644 --- a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java @@ -3,6 +3,7 @@ import ai.javaclaw.agent.Agent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.autoconfigure.AutoConfiguration; import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; @@ -19,7 +20,8 @@ public class TelegramChannelAutoConfiguration { public TelegramChannel telegramChannel(@Value("${agent.channels.telegram.token:null}") String botToken, @Value("${agent.channels.telegram.username:null}") String allowedUsername, Agent agent, - ChannelRegistry channelRegistry) { - return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry); + ChannelRegistry channelRegistry, + SpeechToTextService speechToTextService) { + return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry, speechToTextService); } } diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java new file mode 100644 index 0000000..5ab8a04 --- /dev/null +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java @@ -0,0 +1,29 @@ +package ai.javaclaw.channels.telegram; + +import org.telegram.telegrambots.meta.api.methods.GetFile; +import org.telegram.telegrambots.meta.api.objects.message.Message; +import org.telegram.telegrambots.meta.exceptions.TelegramApiException; +import org.telegram.telegrambots.meta.generics.TelegramClient; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; + +class TelegramVoiceDownloader { + + private final TelegramClient telegramClient; + private final String botToken; + + TelegramVoiceDownloader(TelegramClient telegramClient, String botToken) { + this.telegramClient = telegramClient; + this.botToken = botToken; + } + + InputStream download(Message message) throws TelegramApiException, IOException { + String fileId = message.getVoice().getFileId(); + GetFile getFile = new GetFile(fileId); + String filePath = telegramClient.execute(getFile).getFilePath(); + String fileUrl = "https://api.telegram.org/file/bot" + botToken + "/" + filePath; + return URI.create(fileUrl).toURL().openStream(); + } +} diff --git a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java index a8d0378..d688eee 100644 --- a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java +++ b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java @@ -2,6 +2,7 @@ import ai.javaclaw.agent.Agent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -32,6 +33,9 @@ class TelegramChannelTest { @Mock private Agent agent; + @Mock + private SpeechToTextService speechToTextService; + // ----------------------------------------------------------------------- // Ignored updates // ----------------------------------------------------------------------- @@ -48,13 +52,17 @@ void ignoresUpdatesWithoutMessage() { } @Test - void ignoresUpdatesWithoutText() { + void ignoresUpdatesWithoutTextOrVoice() { TelegramChannel channel = channel("allowed_user"); Update update = mock(Update.class); Message message = mock(Message.class); + User user = mock(User.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); + when(message.getFrom()).thenReturn(user); + when(user.getUserName()).thenReturn("allowed_user"); when(message.hasText()).thenReturn(false); + when(message.hasVoice()).thenReturn(false); channel.consume(update); @@ -68,7 +76,6 @@ void ignoresMessagesFromNullUsername() { Message message = mock(Message.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); - when(message.hasText()).thenReturn(true); when(message.getFrom()).thenReturn(null); channel.consume(update); @@ -234,7 +241,7 @@ void sendMessageFallbacksToSendingRawTextWhenFailingToSendHtml() throws Telegram // ----------------------------------------------------------------------- private TelegramChannel channel(String allowedUsername) { - return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry()); + return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry(), speechToTextService); } private Update updateFromUnknownUser(String username) { @@ -243,7 +250,6 @@ private Update updateFromUnknownUser(String username) { User user = mock(User.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); - when(message.hasText()).thenReturn(true); when(message.getFrom()).thenReturn(user); when(user.getUserName()).thenReturn(username); return update;