From 5e182d3ed95759aa8866229439a2e0fc91a30caa Mon Sep 17 00:00:00 2001 From: Aditya Date: Fri, 13 Sep 2024 00:26:24 +0530 Subject: [PATCH 1/2] basic working + deepgram --- src/audio.py | 37 ++++++++++++---------- src/llm.py | 80 ++++++++++++++++++++++++++++++++++++++++-------- src/prompts.py | 43 ++++++++++++++++++++++++++ src/simple_ui.py | 27 ++++++++++------ 4 files changed, 148 insertions(+), 39 deletions(-) create mode 100644 src/prompts.py diff --git a/src/audio.py b/src/audio.py index 523656c..91cd952 100644 --- a/src/audio.py +++ b/src/audio.py @@ -1,13 +1,10 @@ -"""Audio utilities.""" import numpy as np -import soundcard as sc +import sounddevice as sd import soundfile as sf from loguru import logger from src.constants import OUTPUT_FILE_NAME, RECORD_SEC, SAMPLE_RATE -SPEAKER_ID = str(sc.default_speaker().name) - def record_batch(record_sec: int = RECORD_SEC) -> np.ndarray: """ @@ -18,20 +15,26 @@ def record_batch(record_sec: int = RECORD_SEC) -> np.ndarray: Returns: np.ndarray: The recorded audio sample. - - Example: - ```python - audio_sample = record_batch(5) - print(audio_sample) - ``` """ - logger.debug("Recording for {record_sec} second(s)...") - with sc.get_microphone( - id=SPEAKER_ID, - include_loopback=True, - ).recorder(samplerate=SAMPLE_RATE) as mic: - audio_sample = mic.record(numframes=SAMPLE_RATE * record_sec) - return audio_sample + logger.debug(f"Recording for {record_sec} second(s)...") + + # Get the default input device (should work with your MacBook Air Microphone) + device_info = sd.query_devices(kind='input') + channels = device_info['max_input_channels'] + + if channels == 0: + logger.error("No available input channels. 
Please check your microphone settings.") + return np.array([]) + + try: + # Record using sounddevice + audio_sample = sd.rec(int(record_sec * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=channels) + sd.wait() # Wait until the recording is finished + logger.debug("Recording complete.") + return audio_sample + except Exception as e: + logger.error(f"Recording failed: {e}") + return np.array([]) # Return an empty array on failure def save_audio_file(audio_data: np.ndarray, output_file_name: str = OUTPUT_FILE_NAME) -> None: diff --git a/src/llm.py b/src/llm.py index 4d41cde..05503a1 100644 --- a/src/llm.py +++ b/src/llm.py @@ -1,19 +1,36 @@ +import time import openai +from deepgram import DeepgramClient from loguru import logger -from src.constants import INTERVIEW_POSTION, OPENAI_API_KEY, OUTPUT_FILE_NAME +from src.constants import DEEPGRAM_API_KEY, OPENAI_API_KEY, OUTPUT_FILE_NAME +from src.prompts import SYSTEM_PROMPT openai.api_key = OPENAI_API_KEY -SYSTEM_PROMPT = f"""You are interviewing for a {INTERVIEW_POSTION} position. -You will receive an audio transcription of the question. It may not be complete. You need to understand the question and write an answer to it.\n -""" -SHORTER_INSTRACT = "Concisely respond, limiting your answer to 70 words." -LONGER_INSTRACT = ( + + +# SHORTER_INSTRACT = "Concisely respond, limiting your answer to 70 words." + +LONGER_INSTRUCT = ( "Before answering, take a deep breath and think one step at a time. Believe the answer in no more than 150 words." ) +def log_time(func): + """ + Decorator to measure and log the execution time of a function. + """ + def wrapper(*args, **kwargs): + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + logger.info(f"Executing {func.__name__}: {end_time - start_time:.4f} seconds") + return result + return wrapper + + +@log_time def transcribe_audio(path_to_file: str = OUTPUT_FILE_NAME) -> str: """ Transcribes an audio file into text. 
@@ -36,7 +53,41 @@ def transcribe_audio(path_to_file: str = OUTPUT_FILE_NAME) -> str: return transcript["text"] -def generate_answer(transcript: str, short_answer: bool = True, temperature: float = 0.7) -> str: +@log_time +def transcribe_audio_deepgram(path_to_file: str = OUTPUT_FILE_NAME) -> str: + """ + Transcribes an audio file into text using Deepgram. + + Args: + path_to_file (str): The path to the audio file to be transcribed. + + Returns: + str: The transcribed text. + + Raises: + Exception: If the audio file fails to transcribe. + """ + with open(path_to_file, "rb") as audio_file: + audio_data = audio_file.read() + + try: + dp = DeepgramClient(api_key=DEEPGRAM_API_KEY) + response = dp.listen.rest.v("1").transcribe_file({ + 'buffer': audio_data, + 'mimetype': 'audio/wav' + }, { + 'punctuate': True, + 'diarize': True + }) + transcript = response['results']['channels'][0]['alternatives'][0]['transcript'] + except Exception as error: + logger.error(f"Can't transcribe audio: {error}") + raise error + + return transcript + + +def generate_answer(transcript: str, history: str, short_answer: bool = True, temperature: float = 0.7) -> str: """ Generates an answer based on the given transcript using the OpenAI GPT-3.5-turbo model. @@ -44,7 +95,7 @@ def generate_answer(transcript: str, short_answer: bool = True, temperature: flo transcript (str): The transcript to generate an answer from. short_answer (bool): Whether to generate a short answer or not. Defaults to True. temperature (float): The temperature parameter for controlling the randomness of the generated answer. - + history (str): conversation history Returns: str: The generated answer. @@ -58,10 +109,15 @@ def generate_answer(transcript: str, short_answer: bool = True, temperature: flo Raises: Exception: If the LLM fails to generate an answer. 
""" - if short_answer: - system_prompt = SYSTEM_PROMPT + SHORTER_INSTRACT - else: - system_prompt = SYSTEM_PROMPT + LONGER_INSTRACT + # if short_answer: + # system_prompt = SYSTEM_PROMPT + SHORTER_INSTRACT + # else: + + system_prompt = SYSTEM_PROMPT + LONGER_INSTRUCT + + if history: + system_prompt += f"\nconversation history: \n {history}" + try: response = openai.ChatCompletion.create( model="gpt-3.5-turbo", diff --git a/src/prompts.py b/src/prompts.py new file mode 100644 index 0000000..acfbf1c --- /dev/null +++ b/src/prompts.py @@ -0,0 +1,43 @@ +SYSTEM_PROMPT = f""" +You are a service agent for Avoca Air Conditioning. You will receive an audio transcription of the customer's question, which might be incomplete. +Your task is to understand the question and respond according to the following guidelines: + +TONE: Be confident, warm, and approachable. Keep the language varied and concise, as you're communicating over the phone. + +Response if they’re not looking for service: +Kindly ask them to leave a message, letting them know that an agent will contact them by the next business day. + +Information to collect (Ask strictly one thing at a time): +Problem or issue they're facing +Age of their system +Name +Address +Callback number +Email + +Once all the information is extracted prompt confirm it from the user, and only after the explicit confirmation schedule the call + +Service Titan Job Scheduling: (only if and when all the information is available) +Schedule the appointment as unassigned for the next business day morning. Tell them: "We’ve got you scheduled for the next business day. +A dispatcher will reach out in the morning to confirm the exact time. We don’t provide service on weekends." + +Commonly Asked Questions: +When is the earliest I can schedule? +"The soonest we can schedule is the day after tomorrow. For example, right now it’s Thursday, February 22nd, 12:35 PM, so the first available slot is Monday morning. 
However, an agent can call between 7:30 AM and 8:30 AM tomorrow." + +What are your hours? +"We're open 8 AM to 5 PM, Monday through Friday." + +When can I speak to a live agent? +"The earliest you can talk to someone is between 7:30 and 8:30 AM tomorrow." + +What time will the technician arrive? +"We provide open time frames, and our dispatcher will keep you updated throughout the day." + +Is there a service fee? +"The diagnostic fee is $79 unless you’re looking to replace your system, in which case we offer a free quote." + +Last Line: +Thank you for giving us the opportunity to earn your business. One of our agents will contact you to confirm your appointment. + +""" \ No newline at end of file diff --git a/src/simple_ui.py b/src/simple_ui.py index 57f6208..7f0f99d 100644 --- a/src/simple_ui.py +++ b/src/simple_ui.py @@ -5,6 +5,8 @@ from src import audio, llm from src.constants import APPLICATION_WIDTH, OFF_IMAGE, ON_IMAGE +history = "AI: Thank you for calling Dooley Service Pro, this is Sarah your virtual assistant how may I help you today!" 
+ def get_text_area(text: str, size: tuple) -> sg.Text: """ @@ -85,26 +87,31 @@ def background_recording_loop() -> None: elif event in ("a", "A"): # send audio to OpenAI Whisper model logger.debug("Analyzing audio...") analyzed_text_label.update("Start analyzing...") - WINDOW.perform_long_operation(llm.transcribe_audio, "-WHISPER COMPLETED-") + WINDOW.perform_long_operation(llm.transcribe_audio_deepgram, "-WHISPER COMPLETED-") elif event == "-WHISPER COMPLETED-": audio_transcript = values["-WHISPER COMPLETED-"] analyzed_text_label.update(audio_transcript) # Generate quick answer: - quick_chat_gpt_answer.update("Chatgpt is working...") - WINDOW.perform_long_operation( - lambda: llm.generate_answer(audio_transcript, short_answer=True, temperature=0), - "-CHAT_GPT SHORT ANSWER-", - ) + # quick_chat_gpt_answer.update("Chatgpt is working...") + # WINDOW.perform_long_operation( + # lambda: llm.generate_answer(audio_transcript, short_answer=True, temperature=0, history=history), + # "-CHAT_GPT SHORT ANSWER-", + # ) # Generate full answer: full_chat_gpt_answer.update("Chatgpt is working...") WINDOW.perform_long_operation( - lambda: llm.generate_answer(audio_transcript, short_answer=False, temperature=0.7), - "-CHAT_GPT LONG ANSWER-", + lambda: llm.generate_answer(audio_transcript, short_answer=False, temperature=0.7, history=history), + "-CHAT_GPT LONG ANSWER-" ) - elif event == "-CHAT_GPT SHORT ANSWER-": - quick_chat_gpt_answer.update(values["-CHAT_GPT SHORT ANSWER-"]) + history += f"\nUSER: {values['-WHISPER COMPLETED-']}" + + # elif event == '-CHAT_GPT SHORT ANSWER-': + # history += f'\nAI: {values["-CHAT_GPT SHORT ANSWER-"]}' + # quick_chat_gpt_answer.update(values["-CHAT_GPT SHORT ANSWER-"]) + elif event == "-CHAT_GPT LONG ANSWER-": + history += f'\nAI: {values["-CHAT_GPT LONG ANSWER-"]}' full_chat_gpt_answer.update(values["-CHAT_GPT LONG ANSWER-"]) From ccb37d15b91daa63e033907cf1fcd1a9bc5cb0b2 Mon Sep 17 00:00:00 2001 From: Aditya Date: Mon, 16 Sep 2024 13:11:18 
+0530 Subject: [PATCH 2/2] streaming voice --- .gitignore | 2 +- src/audio.py | 58 --------- src/constants.py | 3 +- src/llm.py | 133 --------------------- src/main.py | 300 +++++++++++++++++++++++++++++++++++++++++++++++ src/prompts.py | 56 +++++---- src/simple_ui.py | 117 ------------------ 7 files changed, 330 insertions(+), 339 deletions(-) delete mode 100644 src/audio.py delete mode 100644 src/llm.py create mode 100644 src/main.py delete mode 100644 src/simple_ui.py diff --git a/.gitignore b/.gitignore index 68bc17f..fb6436e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ __pycache__/ *.py[cod] *$py.class - +.idea/ # C extensions *.so diff --git a/src/audio.py b/src/audio.py deleted file mode 100644 index 91cd952..0000000 --- a/src/audio.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np -import sounddevice as sd -import soundfile as sf -from loguru import logger - -from src.constants import OUTPUT_FILE_NAME, RECORD_SEC, SAMPLE_RATE - - -def record_batch(record_sec: int = RECORD_SEC) -> np.ndarray: - """ - Records an audio batch for a specified duration. - - Args: - record_sec (int): The duration of the recording in seconds. Defaults to the value of RECORD_SEC. - - Returns: - np.ndarray: The recorded audio sample. - """ - logger.debug(f"Recording for {record_sec} second(s)...") - - # Get the default input device (should work with your MacBook Air Microphone) - device_info = sd.query_devices(kind='input') - channels = device_info['max_input_channels'] - - if channels == 0: - logger.error("No available input channels. 
Please check your microphone settings.") - return np.array([]) - - try: - # Record using sounddevice - audio_sample = sd.rec(int(record_sec * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=channels) - sd.wait() # Wait until the recording is finished - logger.debug("Recording complete.") - return audio_sample - except Exception as e: - logger.error(f"Recording failed: {e}") - return np.array([]) # Return an empty array on failure - - -def save_audio_file(audio_data: np.ndarray, output_file_name: str = OUTPUT_FILE_NAME) -> None: - """ - Saves an audio data array to a file. - - Args: - audio_data (np.ndarray): The audio data to be saved. - output_file_name (str): The name of the output file. Defaults to the value of OUTPUT_FILE_NAME. - - Returns: - None - - Example: - ```python - audio_data = np.array([0.1, 0.2, 0.3]) - save_audio_file(audio_data, "output.wav") - ``` - """ - logger.debug(f"Saving audio file to {output_file_name}...") - sf.write(file=output_file_name, data=audio_data, samplerate=SAMPLE_RATE) diff --git a/src/constants.py b/src/constants.py index ac128a0..c96fb45 100644 --- a/src/constants.py +++ b/src/constants.py @@ -1,10 +1,11 @@ INTERVIEW_POSTION = "python developer" OPENAI_API_KEY = "" +DEEPGRAM_API_KEY = "" OUTPUT_FILE_NAME = "out.wav" # audio file name. SAMPLE_RATE = 48000 # [Hz]. sampling rate. -RECORD_SEC = 1 # [sec]. duration recording audio. +RECORD_SEC = 2 # [sec]. duration recording audio. 
APPLICATION_WIDTH = 85 OFF_IMAGE = b"iVBORw0KGgoAAAANSUhEUgAAAGQAAAAoCAYAAAAIeF9DAAAPpElEQVRoge1b63MUVRY//Zo3eQHyMBEU5LVYpbxdKosQIbAqoFBraclatZ922Q9bW5b/gvpBa10+6K6WftFyxSpfaAmCEUIEFRTRAkQFFQkkJJghmcm8uqd763e6b+dOZyYJktoiskeb9OP2ne7zu+d3Hve2smvXLhqpKIpCmqaRruu1hmGsCoVCdxiGMc8wjNmapiUURalGm2tQeh3HSTuO802xWDxhmmaraZotpmkmC4UCWZZFxWKRHMcZVjMjAkQAEQqFmiORyJ+j0ei6UCgUNgyDz6uqym3Edi0KlC0227YBQN40zV2FQuHZbDa7O5fLOQBnOGCGBQTKNgzj9lgs9s9EIrE4EomQAOJaVf5IBYoHAKZpHs7lcn9rbm7+OAjGCy+8UHKsD9W3ruuRSCTyVCKR+Es8HlfC4bAPRF9fHx0/fpx+/PFH6unp4WOYJkbHtWApwhowYHVdp6qqKqqrq6Pp06fTvHnzqLq6mnWAa5qmLTYM48DevXuf7e/vf+Suu+7KVep3kIWsXbuW/7a0tDREo9Ed1dXVt8bjcbYK/MB3331HbW1t1N7eTgAIFoMfxSZTF3lU92sUMcplisJgxJbL5Sifz1N9fT01NjbSzTffXAKiaZpH+/v7169Zs+Yszr344oslFFbWQlpaWubGYrH3a2pqGmKxGCv74sWL9Pbbb1NnZyclEgmaNGmST13kUVsJ0h4wOB8EaixLkHIEKKAmAQx8BRhj+/btNHnyZNqwYQNNnDiR398wjFsTicSBDz74oPnOO+/8Gro1TbOyhWiaVh+Pxz+ura3FXwbj8OHDtHv3bgI448aNYyCg5Ouvv55mzJjBf2traykajXIf2WyWaQxWdOrUKTp//rww3V+N75GtRBaA4lkCA5NKpSiTydDq1atpyZIlfkvLstr7+/tvTyaT+MuAUhAQVVUjsVgMYABFVvzOnTvp888/Z34EIDgHjly6dCmfc3vBk4leFPd/jBwo3nHo559/pgMfHaATX59ApFZCb2NJKkVH5cARwAAUKBwDdOHChbRu3Tq/DegrnU4DlBxAwz3aQw895KpRUaCsp6urq9fDQUHxsIojR47QhAkTCNYCAO677z5acNttFI3FyCGHilaRUqk0myi2/nSaRwRMV9c1UhWFYrEozZo9mx3eyW9OMscGqexq3IJS7hlJOk+S3xTnvLyNB+L333/P4MycOVMYwGRN02pt234PwHFAJCxE1/Vl48aNO1hXV6fAEj777DPCteuuu44d9w033EDr16/3aQlKv3TpEv8tHS6exXiCvmpqaigWj5NCDqXT/bT9tdfoYnc39yWs5WqXcr6j0rHwK/I+KAy66u7upubmZlq8eLG47mQymeU9PT0fg95UD00lFAptSyQSHNrCgcM6xo8fz2DceOONtHnTJt4v2kXq7LxAHR0d7CvYccujRlNIwchX3WO06ejopM6ODrKsIgP0xy1bGGhhSRgZV7sELaNcRBnclzcwDt4dLAPdAhih+3A4/A8wEKyIAdE0bU0kEuGkDyaGaAo3YwMod999NyvZtCx20JlMf8lDkaK6ICgq8X/sRrxj1QUMwJw/D1BMvu8P99/PYTPCRAHI1Uxf5aLESvQ1FChQPPQKHQvRNG1pNBpdDf2rHl2hHMI3nD592g9tcdy8ppl03eCR3N3VxT5D5n9331U6/2XLUEv2Fe9vsWjRha5uKloWhUMGbdiwnjkVPkVEGWPNUoLnKJB/BdvACqBb6Bg5nbhmGMZWpnBVVWpDodDvw+EQO+H9+/fzDbhx9uzZTC2OU6Te3l5Wms/3AV9R8tCOe9FRSps4pJBdtCh56RKHyfX1DTRnzhx2dgAf/mQ0Iy9ky0jMFi1aVHL+k08+YWWAs4WibrnlFlq+fPmQ/bW2ttJPP/1EW7ZsGbL
diRMn2P/KdT74EfFbYAboGAn2rFlu4qjrGjCoVVVVawqFQiHDCHG0hNwBSKGjhYsWckf5XJ5yHBkJK3AtwPcVgq48y1A0lVRN8Y5Vv72GB1I1DgXzuRw5tsPZLHwJnJ5cdrnSbdq0afTAAw8MAgOybNkyVuqUKVN8yxxJJRa0i204wful0+lBVEwD1sA6hq77+lI8eBVFBQZNqqZpvxMZ97Fjxxg9HONhq6uq2IlnsjkXaU/xLlVppLHCNRck35m759FO0zyHrwpwNB8kvJjt2DS+bjxn/fAloMWRKGY4gWXI8X4luffee5kJ8LsjEQyakVArgEBbYRWyyNQFXUPnQoCFrmnafFwEICgUohEU1tDQQLbtlQXsImmqihyPFMWjI4bbIdUBFam8r5CbCJLi0pU79AjunRzVvU/1ruPFsOHhkO0fOnRoIFu9QtpasGCBv//DDz/Qu+++S2fOnOF3RMSIeh1yIggS3D179pQMhMcee4yTWVEWEgI9wfKEwDHv27dvUPUBx3DecjgvrguQ0Aa6xvMJqgQWuqqqMwXP4SHA4xCMWlGbwYh3exXde0onDwQSICnAhc+riuIn74yh15oR5HMqjyIEDPUN9cynIgS+0rxEKBuOc9u2bczXSG5h+QgiXn31VXrwwQc5t4KffOutt0pCb7QTpaCgUhEJyccoJUH5QfBEqUi0C1q+qBIjg5f6m6Fjlk84H/AekjgcV1VXk+Ol/6Cjih5ciOfkub2iuqA4A5Yi4GMsaaCtYxdpwvgJPh1cKWWBrjCSIaADhJg4J49YKB/hOwCBgnFdBuTRRx8d1O/JkyfZksSAhSBRxiYLAoXnn3/eD1AqvY+okCeTSd96VFWtASBVgtegFNFJyNDdhwTlqKXoO/6oH8BpiKDLvY5+yjSwHcdNOD0KG80kEX5KTBHIIxj7YAMhSNaG+12E5hiwsJyhBP0gIsXAFgOjkgidCwEWuhzNyOk+/Af8BUdRnqpLaojSUen5YSTQGC8gttFw6HIfsI5KRUxQspCuri6aOnXqkP1isCB6Gu4ZOSq9zLxKfj7dcZw+x3Gq0BG4U/wgRhfMXCR//s3Sv25hl52GDw1T0zAIKS5zMSUWbZsLkqMlGJ1QCCwD1dUDBw6UHf1w7hBEdwBEVsrjjz8+yKmDXuCL5HZw6shNhFMXDhu+J+hTyonQuRBgoXsrJqpwDlVesUIC3BaJRlh7hqaxB/B8OXk+2hvtiqi4+2gzpqoHkIi6PJ5TvAQRlFfwKOpCV9eoluORaM6dO5dp4+GHH+aKNWpvUBIsA5EVSkLkRWHBAieOca/s1EVkFHTyACno1L11CEM+o5hhRFAgRWCXdNu2TxWLxQaghYdEZIJ9/J00eTKRbZIaCZPDilcGrMJz0H6465kEY6EKvDwa5PkRhfy4S3HbF7MWJ4ciJA2+8C8RvBzmbwAIBGGqHKoGZceOHX6oLysa5wTlyRIsi4iioezsg/Mj5WhORLCYUZTuO606jnNMOFPkAzB37KNE4BRdSsEmlKX5SR6SQdU77yaFqtfGTQA1r6blZvAaZ/AaX1M4D7FdJ+7Y9O2335aMUnlJzS/ZEOm8+eabw8KJFR9ggmB4e7kSLL3L7yCfl6/h3aHrm266yffhtm0fV23b3i8mR+bPn8+NgBx4NZnsYZ7PZtxMHQBwJq55ZRKpNKJ5inYVrvrZO498v42bteNcNpsjx7G5DI0QFCNytOZG8Bznzp2j5557jvbu3TvoOsrfTzzxBE8vI+TFCB8pXVZSMlUAo9IcPJeP8nmuoQmxbbsVlNViWVbBsqwQHg4ZOhwjlHPkiy9oxR13kJ3P880iKWKK4mxcJHkeiSkDeYbrLRQ/ifTDAcWhXD5Hhby7EqZ1XyuHh6JaUO4lfomgLzwz1gOgYArnLSIfXMO7iOQPx0ePHuUAALOeGBTwIeWeBZNyTz75pF9shd8dDozgOYS6CJqga+l3gEELoiwsd3wvn89vxMOtXLmSXn7
5ZR6xKKXM6ezkim9vX68/Hy78uVISbXl+Y8C1uDgEEhVMUvVe6iWbHDrXfo6OHT/GeYBY8zVagJBUwkDfcp1M8dZLydVlgCCmIMjL1is9B/oT+YjwfZXAKAeMyGk2btzotykWi8Agyfxgmua/gBiQmzVrFq8iwTFuRljHcTXTWDfPaah+kVHMhahSAdGt6mr+vIjq+ReVR1R3dxf3hQryG2+84U+EyRYyWiJCdvSN3wA4YoKIZ+ekyE6uwoqp5XI0JqItWJhYxXk5YIhKMPIelG1owGqegc4ZENu2d+fz+cNi9m7Tpk0MiEASnGuaFs/2dXRcoGwmw5EUNkVUc0maPfRnEL3pTkXhEjumcTHraBaLXE/CbyBslOP2K3Xo/4tNVra8lQNA3jDgUUuDLjZv3iw780PZbHYP9K0hTvc6OKYoyp9CoZDCixJiMfrqq694FKATOF6Ej7AAHMMpozDII01xfUq5OQwoHY4bnIsySSFf4AVkyAvgs8DBQ43Iq0VGa5EDEk5MiUvW4eTz+ft7e3vP4roMSLvjOBN1XV8CM4TyoUxM6YIzAQJm2VA1TcQTbDHpVIp9S8Es8LFYHIb7+nr7qKu7i3r7+tgqIOfOtdMrr/yHHaMMxtW6eC44+iu1Ce4PBQYWyzU1NfnXsTo+lUr9G8EE1xI//PBDv0NVVaPxePwgFsqJFYrvvPMOT3lCeeBcOEdUSRcvXkS1NdJCOZIrjAOFeeyjxNzW9hFXTGF5oClBVWNlGRCNwkI5VAjuuecevw0WyqVSqd8mk8ks2vCMqQwIuWUDfykplAaFARAAA/qCtXhL7KmurpamT5tOU6ZiKalbagAUuWyOkj1JOtt+1l80IRxr0ImPFTCCUinPKLeUFMoGTWHqWAiWknqrFnkpqZi1HATIqlWrMFk0Nx6P82Jrsb4XieLrr7/O88CinO0MfP8wqGKrDHzk409Xim2sLiWly1hsDdoW0RSCJFFdRlvLss729/c3NzY2fo3gRi7Bl139joZtbW3LHcfZYds2f46AXGTr1q1MO8h+kaNAsZVWi/gZvLeUUvGmbRFJ4IHHsgR9RPBzBGzwwcgzsKpGBq9QKOBzhI0rVqw4Q16RUZaKH+w0Njae3b9//+22bT9lWZb/wQ6iA/wIoqYvv/ySK6siivLXp5aJtsYqNVUSAYao7MLHYmEIyvooQckTWZ4F4ZO2Z9Pp9CNNTU05+ZosZSkrKAcPHsQnbU/H4/ElYgX8/z9pG14kSj+UyWT+vnLlyoNBAF566aWS4xEBIuTTTz/Fcse/RqPRteFwOCy+ExHglFtuea2IHCJ7/qRgmubOfD7/jPfRpz+TOFQYPQiQoUQ4asMw8Fk0FtitCIVCv9F1nT+LVlW16hoFJOU4Tsq2bXwWfdyyrNZCodBSKBSScNgjXsBBRP8FGptkKVwR+ZoAAAAASUVORK5CYII=" diff --git a/src/llm.py b/src/llm.py deleted file mode 100644 index 05503a1..0000000 --- a/src/llm.py +++ /dev/null @@ -1,133 +0,0 @@ -import time -import openai -from deepgram import DeepgramClient -from loguru import logger - -from src.constants import DEEPGRAM_API_KEY, OPENAI_API_KEY, OUTPUT_FILE_NAME -from src.prompts import SYSTEM_PROMPT - -openai.api_key = OPENAI_API_KEY - - - -# SHORTER_INSTRACT = "Concisely respond, limiting your answer to 70 words." 
- -LONGER_INSTRUCT = ( - "Before answering, take a deep breath and think one step at a time. Believe the answer in no more than 150 words." -) - - -def log_time(func): - """ - Decorator to measure and log the execution time of a function. - """ - def wrapper(*args, **kwargs): - start_time = time.time() - result = func(*args, **kwargs) - end_time = time.time() - logger.info(f"Executing {func.__name__}: {end_time - start_time:.4f} seconds") - return result - return wrapper - - -@log_time -def transcribe_audio(path_to_file: str = OUTPUT_FILE_NAME) -> str: - """ - Transcribes an audio file into text. - - Args: - path_to_file (str, optional): The path to the audio file to be transcribed. - - Returns: - str: The transcribed text. - - Raises: - Exception: If the audio file fails to transcribe. - """ - with open(path_to_file, "rb") as audio_file: - try: - transcript = openai.Audio.translate("whisper-1", audio_file) - except Exception as error: - logger.error(f"Can't transcribe audio: {error}") - raise error - return transcript["text"] - - -@log_time -def transcribe_audio_deepgram(path_to_file: str = OUTPUT_FILE_NAME) -> str: - """ - Transcribes an audio file into text using Deepgram. - - Args: - path_to_file (str): The path to the audio file to be transcribed. - - Returns: - str: The transcribed text. - - Raises: - Exception: If the audio file fails to transcribe. 
- """ - with open(path_to_file, "rb") as audio_file: - audio_data = audio_file.read() - - try: - dp = DeepgramClient(api_key=DEEPGRAM_API_KEY) - response = dp.listen.rest.v("1").transcribe_file({ - 'buffer': audio_data, - 'mimetype': 'audio/wav' - }, { - 'punctuate': True, - 'diarize': True - }) - transcript = response['results']['channels'][0]['alternatives'][0]['transcript'] - except Exception as error: - logger.error(f"Can't transcribe audio: {error}") - raise error - - return transcript - - -def generate_answer(transcript: str, history: str, short_answer: bool = True, temperature: float = 0.7) -> str: - """ - Generates an answer based on the given transcript using the OpenAI GPT-3.5-turbo model. - - Args: - transcript (str): The transcript to generate an answer from. - short_answer (bool): Whether to generate a short answer or not. Defaults to True. - temperature (float): The temperature parameter for controlling the randomness of the generated answer. - history (str): conversation history - Returns: - str: The generated answer. - - Example: - ```python - transcript = "Can you tell me about the weather?" - answer = generate_answer(transcript, short_answer=False, temperature=0.8) - print(answer) - ``` - - Raises: - Exception: If the LLM fails to generate an answer. 
- """ - # if short_answer: - # system_prompt = SYSTEM_PROMPT + SHORTER_INSTRACT - # else: - - system_prompt = SYSTEM_PROMPT + LONGER_INSTRUCT - - if history: - system_prompt += f"\nconversation history: \n {history}" - - try: - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - temperature=temperature, - messages=[ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": transcript}, - ], - ) - except Exception as error: - logger.error(f"Can't generate answer: {error}") - raise error - return response["choices"][0]["message"]["content"] diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..3a81194 --- /dev/null +++ b/src/main.py @@ -0,0 +1,300 @@ +import PySimpleGUI as sg +import pyaudio +import threading +import websockets +import json +import logging +import asyncio +import time +import queue +import openai +from src.constants import DEEPGRAM_API_KEY, OPENAI_API_KEY +from src.prompts import SYSTEM_PROMPT + +# Set up logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +# Audio configuration +CHUNK = 8192 +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +RATE = 16000 + +# Deepgram WebSocket configuration +WS_URL = "wss://api.deepgram.com/v1/listen?encoding=linear16&sample_rate=16000&channels=1" +WS_HEADER = {"Authorization": f"Token {DEEPGRAM_API_KEY}"} + +# Keep-alive configuration +KEEP_ALIVE_INTERVAL = 5 # seconds +openai.api_key = OPENAI_API_KEY + + +async def websocket_handler(audio_queue, window): + """ + Handles the WebSocket connection to Deepgram, sending audio data and receiving transcriptions. 
+ """ + reconnect_delay = 1 + while True: + try: + async with websockets.connect(WS_URL, extra_headers=WS_HEADER) as ws: + logger.info("Connected to Deepgram WebSocket.") + reconnect_delay = 1 # Reset reconnect delay on successful connection + state = {'last_audio_time': time.time()} + + # Create tasks for sending audio, receiving messages, and keep-alive + send_task = asyncio.create_task(send_audio(ws, audio_queue, state)) + receive_task = asyncio.create_task(receive_messages(ws, window)) + keep_alive_task = asyncio.create_task(send_keep_alive(ws, state)) + + # Wait for any task to complete (e.g., due to an exception) + done, pending = await asyncio.wait( + [send_task, receive_task, keep_alive_task], + return_when=asyncio.FIRST_EXCEPTION + ) + + # Cancel all pending tasks + for task in pending: + task.cancel() + + except websockets.exceptions.InvalidStatusCode as e: + logger.error(f"Invalid status code: {e.status_code}") + await asyncio.sleep(reconnect_delay) + reconnect_delay = min(reconnect_delay * 2, 60) + except websockets.exceptions.WebSocketException as e: + logger.error(f"WebSocket exception: {e}") + await asyncio.sleep(reconnect_delay) + reconnect_delay = min(reconnect_delay * 2, 60) + except Exception as e: + logger.error(f"Unexpected exception: {e}") + await asyncio.sleep(reconnect_delay) + reconnect_delay = min(reconnect_delay * 2, 60) + else: + # If the connection closes normally, reset the reconnect delay + reconnect_delay = 1 + + logger.debug("WebSocket handler terminated.") + + +async def send_audio(ws, audio_queue, state): + """ + Sends audio data from the queue to the WebSocket. 
+ """ + while True: + audio_data = await asyncio.to_thread(audio_queue.get) + if audio_data is None: + logger.debug("Received stop signal for sending audio.") + break + try: + await ws.send(audio_data) + state['last_audio_time'] = time.time() + logger.debug("Sent audio data to Deepgram.") + except websockets.exceptions.WebSocketException as e: + logger.error(f"WebSocket exception while sending audio: {e}") + break + + +async def receive_messages(ws, window): + """ + Receives transcription messages from the WebSocket and updates the GUI. + """ + while True: + try: + response = await ws.recv() + response_json = json.loads(response) + if 'channel' in response_json and 'alternatives' in response_json['channel']: + transcription = response_json['channel']['alternatives'][0]['transcript'] + if transcription: + window.write_event_value("-TRANSCRIPT-", transcription) + logger.debug(f"Received transcription: {transcription}") + except websockets.exceptions.WebSocketException as e: + logger.error(f"WebSocket exception while receiving messages: {e}") + break + except Exception as e: + logger.error(f"Exception while receiving messages: {e}") + break + + +async def send_keep_alive(ws, state, keep_alive_interval=KEEP_ALIVE_INTERVAL): + """ + Sends keep-alive messages to maintain the WebSocket connection. + """ + while True: + await asyncio.sleep(keep_alive_interval) + if time.time() - state['last_audio_time'] > keep_alive_interval: + try: + await ws.send(json.dumps({"type": "KeepAlive"})) + logger.debug("Sent keep-alive message.") + except websockets.exceptions.WebSocketException as e: + logger.error(f"WebSocket exception while sending keep-alive: {e}") + break + + +class AudioRecorder: + """ + Handles audio recording using PyAudio and sends audio data to a queue. 
+ """ + def __init__(self, audio_queue): + self.audio_queue = audio_queue + self.p = None + self.stream = None + self.is_recording = False + + def start(self): + if self.is_recording: + logger.warning("Audio recording is already in progress.") + return + self.p = pyaudio.PyAudio() + try: + self.stream = self.p.open(format=FORMAT, + channels=CHANNELS, + rate=RATE, + input=True, + frames_per_buffer=CHUNK, + stream_callback=self.callback) + except Exception as e: + logger.error(f"Failed to open audio stream: {e}") + self.p.terminate() + self.p = None + return + self.stream.start_stream() + self.is_recording = True + logger.debug("Audio recording started.") + + def callback(self, in_data, frame_count, time_info, status): + if self.is_recording: + self.audio_queue.put(in_data) + return (None, pyaudio.paContinue) + + def stop(self): + if not self.is_recording: + logger.warning("Audio recording is not active.") + return + self.is_recording = False + if self.stream: + self.stream.stop_stream() + self.stream.close() + self.stream = None + if self.p: + self.p.terminate() + self.p = None + # Signal the WebSocket handler to stop by sending None + self.audio_queue.put(None) + logger.debug("Audio recording stopped.") + + +def start_event_loop(loop, audio_queue, window): + """ + Starts the asyncio event loop. 
+ """ + asyncio.set_event_loop(loop) + loop.run_until_complete(websocket_handler(audio_queue, window)) + + +def gen_llm_answer(transcript: str, window, history: str, temperature: float = 0.7) -> str: + system_prompt = SYSTEM_PROMPT + + if history: + system_prompt += f"\nconversation history: \n {history}" + + try: + response = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + temperature=temperature, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": transcript}, + ], + ) + except Exception as error: + logger.error(f"Can't generate answer: {error}") + raise error + + rv = response["choices"][0]["message"]["content"] + window.write_event_value("-LLM_ANSWER-", rv) + + return rv + + +def main(): + # Define the GUI layout + layout = [ + [sg.Text("Real-time Audio Transcription", font=("Helvetica", 20))], + [sg.Button("Start Recording", key="-RECORD-")], + [sg.Text("Transcript:", font=("Helvetica", 16))], + [sg.Multiline(size=(60, 10), key="-TRANSCRIPT-", disabled=True, autoscroll=True)], + [sg.Multiline(size=(60, 10), key="-LLM_ANSWER-", disabled=True, autoscroll=True)], + [sg.Button("Exit")] + ] + + # Create the window + window = sg.Window("Audio Transcription App", layout, finalize=True) + + # Create the audio queue + audio_queue = queue.Queue() + + # Create the AudioRecorder + recorder = AudioRecorder(audio_queue) + + # Create the asyncio event loop + loop = asyncio.new_event_loop() + + # Start the event loop in a separate daemon thread + loop_thread = threading.Thread(target=start_event_loop, args=(loop, audio_queue, window), daemon=True) + loop_thread.start() + + # Initialize recording state and transcript + recording = False + transcript = "" + llm_answer = "" + + history = "" + + while True: + event, values = window.read(timeout=100) + if event in (sg.WIN_CLOSED, "Exit"): + logger.debug("Exit event triggered. 
Closing application.") + if recording: + recorder.stop() + # Stop the asyncio event loop + loop.call_soon_threadsafe(loop.stop) + loop_thread.join() + break + + if event == "-RECORD-": + if not recording: + # Start recording + recorder.start() + recording = True + window["-RECORD-"].update("Stop Recording") + logger.debug("Recording started via GUI.") + else: + # Stop recording + recorder.stop() + recording = False + window["-RECORD-"].update("Start Recording") + logger.debug("Recording stopped via GUI.") + + if event == "-TRANSCRIPT-": + # Append new transcription to the existing transcript + transcription = values["-TRANSCRIPT-"] + + llm_thread = threading.Thread(target=gen_llm_answer, args=(transcription, window, history)) + llm_thread.start() + + transcript += transcription + "\n" + window["-TRANSCRIPT-"].update(transcript) + history += f'USER: {transcription}' + + if event == "-LLM_ANSWER-": + # Append new transcription to the existing transcript + generated_answer = values["-LLM_ANSWER-"] + history += f'AI: {generated_answer}' + llm_answer += generated_answer + "\n" + window["-LLM_ANSWER-"].update(llm_answer) + + window.close() + + +if __name__ == "__main__": + main() diff --git a/src/prompts.py b/src/prompts.py index acfbf1c..917b852 100644 --- a/src/prompts.py +++ b/src/prompts.py @@ -1,43 +1,41 @@ SYSTEM_PROMPT = f""" -You are a service agent for Avoca Air Conditioning. You will receive an audio transcription of the customer's question, which might be incomplete. -Your task is to understand the question and respond according to the following guidelines: +You are a Sam a sales agent for Avoca Air Conditioning company. +You will receive an audio transcription of the question. It may not be complete. You need to understand the question and write an answer to it based on the following script: \n -TONE: Be confident, warm, and approachable. Keep the language varied and concise, as you're communicating over the phone. +Complete the previous answer first. 
-What time will the technician arrive?
-"We provide open time frames, and our dispatcher will keep you updated throughout the day."
-        if record_status_button.metadata.state:
-            WINDOW.perform_long_operation(background_recording_loop, "-RECORDING-")