expanded readme, updates to agent and speak for customization

This commit is contained in:
maglore9900 2024-10-02 13:50:51 -04:00
parent 9ed115a100
commit 4f79a3f01a
4 changed files with 51 additions and 13 deletions

View File

@ -2,6 +2,8 @@ this is a personal project to create a voice directed digital assistant based on
![alt text](https://www.cartoonbrew.com/wp-content/uploads/2013/05/maxheadroom_main-1280x600.jpg) ![alt text](https://www.cartoonbrew.com/wp-content/uploads/2013/05/maxheadroom_main-1280x600.jpg)
#SUMMARY
written in python, using langchain, langgraph, etc. written in python, using langchain, langgraph, etc.
written to work on Windows. Agent and logic will run on linux but tools are currently windows only. written to work on Windows. Agent and logic will run on linux but tools are currently windows only.
@ -26,7 +28,7 @@ Will move more variables to the .env folders soon.
.env is under the module folder for now .env is under the module folder for now
INSTALLATION #INSTALLATION
so basically the steps are pretty simple so basically the steps are pretty simple
@ -38,3 +40,35 @@ so basically the steps are pretty simple
- then copy example_env.txt to `.env` - then copy example_env.txt to `.env`
- open that, and put in your info, like openai key or ollama or whatever - open that, and put in your info, like openai key or ollama or whatever
- then run `python main.py` to start the whole thing up - then run `python main.py` to start the whole thing up
# TOOLS
## Spotify
you will need to get your Spotify credentials in order to have Max control your Spotify software.
you can find information on getting that information here: https://developer.spotify.com/documentation/web-api/concepts/apps
max can take the following commands: play, pause, stop, next, previous, favorite
*note: you can say really any words that are similar, max will attempt to read your intent and use the right command
## Window Focus
this tool brings the focus of whatever app you name to the front; it will not open an app
*note: only works on windows
## Open App
this tool will open an application. when you run max it will create an index of the apps installed on your system
*note: only works on windows
## Timer
this tool will set a timer with a popup. you tell max to set a timer for X time, and it will convert it to seconds on the backend and create the timer.
the default timer will have a "clippy" popup, with potentially custom text

View File

@ -1,13 +1,16 @@
OPENAI_API_KEY='' OPENAI_API_KEY = ''
#LLM_TYPE will take openai, local, or hybrid #LLM_TYPE will take openai, local
LLM_TYPE='openai' LLM_TYPE = 'openai'
OPENAI_MODEL='gpt-4o-mini' OPENAI_MODEL = 'gpt-4o-mini'
#OLLAMA_MODEL will take any model you can load in ollama #OLLAMA_MODEL will take any model you can load in ollama
OLLAMA_MODEL='gemma2' OLLAMA_MODEL = 'gemma2'
OLLAMA_URL='http://localhost:11434' OLLAMA_URL = 'http://localhost:11434'
CHARACTER='max' #CHARACTER will take any character prompt you have in the modules/prompts.py file. 'max' or 'none' are the default options
CHARACTER = 'max'
#LISTEN_MODEL will take whisper or google, whisper is the default option and best for most cases #LISTEN_MODEL will take whisper or google, whisper is the default option and best for most cases
LISTEN_MODEL='whisper' LISTEN_MODEL='whisper'
#STREAM SPEAK URL is using the default url for AllTalk. If you don't have AllTalk you can ignore this; if you want to use a different service, simply replace the url
STREAM_SPEAK_URL = 'http://127.0.0.1:7851/api/tts-generate'
SPOTIFY_CLIENT_ID = '' SPOTIFY_CLIENT_ID = ''
SPOTIFY_CLIENT_SECRET = '' SPOTIFY_CLIENT_SECRET = ''
SPOTIFY_REDIRECT_URI = 'http://localhost:8888/callback' SPOTIFY_REDIRECT_URI = 'http://localhost:8888/callback'

View File

@ -17,7 +17,7 @@ class Agent:
self.ap = app_launcher.AppLauncher() self.ap = app_launcher.AppLauncher()
self.wf = windows_focus.WindowFocusManager() self.wf = windows_focus.WindowFocusManager()
self.llm = self.ad.llm_chat self.llm = self.ad.llm_chat
self.spk = speak.Speak(model=env("LISTEN_MODEL")) self.spk = speak.Speak(env)
self.prompt = hub.pull("hwchase17/openai-functions-agent") self.prompt = hub.pull("hwchase17/openai-functions-agent")
self.char = env("CHARACTER").lower() self.char = env("CHARACTER").lower()
self.char_prompt = getattr(prompts, self.char, "You are a helpful assistant. User Query: {query}") self.char_prompt = getattr(prompts, self.char, "You are a helpful assistant. User Query: {query}")

View File

@ -12,16 +12,17 @@ from pydub import AudioSegment
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak: class Speak:
def __init__(self, model="whisper"): def __init__(self, env):
self.url = "http://127.0.0.1:7851/api/tts-generate" self.url = env("STREAM_SPEAK_URL")
self.microphone = sr.Microphone() self.microphone = sr.Microphone()
self.engine = pyttsx3.init() self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150) self.engine.setProperty('rate', 150)
self.model_name = model.lower() self.model_name = env("LISTEN_MODEL".lower(), default="whisper")
self.sample_rate = 16000 self.sample_rate = 16000
self.chunk_size = 1024 self.chunk_size = 1024
self.noise_threshold = 500 self.noise_threshold = 500
# Initialize transcription models # Initialize transcription models
if self.model_name == "whisper": if self.model_name == "whisper":
from faster_whisper import WhisperModel from faster_whisper import WhisperModel