Enhancing Elevenlabs-tts with Improved V2 Voices Integration
brentjohnston opened this issue · comments
I've decided to post the code I've been working on. This code is for the much more realistic sounding V2 version of the Elevenlabs voices. I noticed the elevenlabs-tts extension seems to support only the V1 versions, and you can't adjust the style on the V2 version with it.
I had a chat with an Elevenlabs mod on their Discord, and he confirmed that the old extension doesn't pass the right settings to the API call for V2 voices/sliders. This was a big help in figuring out what needed to be fixed.
The script works well, but there are some bugs I hope someone can help with.
This updated code somehow makes the play button in the chat player stop working, and I don't know why, even though it exports the .mp3 files to /extensions/elevenlabs-tts/outputs correctly. Also, the sliders adjust the voice way too much compared to the website. The Elevenlabs devs say it's correct, so I have no idea.
If anyone can help out or thinks this could be a great fork, please go ahead.
To use this, replace the script at /extensions/elevenlabs-tts/script.py from the elevenlabs-tts extension with the code below. The V2 voices do add a whole new level of immersion, especially with the adjustable voice style settings in the file.
Thanks in advance for any help or interest in taking this project further!
import html
import re
from pathlib import Path
import elevenlabs
import gradio as gr
from modules import chat, shared, ui_chat
from modules.logging_colors import logger
from modules.utils import gradio
# Extension settings. The callbacks wired up in ui() update this dict in place,
# and output_modifier() reads it on every generated reply.
params = {
    'activate': True,
    'api_key': None,
    'selected_voice': 'None',
    'autoplay': False,
    'show_text': True,
    'model': 'eleven_multilingual_v2',
    # Voice settings forwarded to elevenlabs.VoiceSettings.
    'stability': 0.7,
    'similarity_boost': 0.5,
    'style': 0.5,
    'use_speaker_boost': True,
}

# Voice names fetched by ui() on first use.
voices = None

# Counter used to number the generated .mp3 files.
wav_idx = 0

# Choices offered in the "Language model" dropdown.
LANG_MODELS = ['eleven_multilingual_v2']
def update_api_key(key):
    """Remember *key* in ``params`` and hand it to elevenlabs unless it is ``None``."""
    params['api_key'] = key
    if key is None:
        return
    elevenlabs.set_api_key(key)
def refresh_voices():
    """Return the ``name`` of every voice reported by ``elevenlabs.voices()``."""
    return [entry.name for entry in elevenlabs.voices()]
def refresh_voices_dd():
    """Build a dropdown update listing the freshly fetched voice names.

    The first voice becomes the selected value. When no voices are returned
    the dropdown is cleared (value ``None``) instead of raising ``IndexError``.
    """
    all_voices = refresh_voices()
    first_voice = all_voices[0] if all_voices else None
    return gr.Dropdown.update(value=first_voice, choices=all_voices)
def remove_tts_from_history(history):
    """Replace every visible reply with the matching internal reply.

    The history dict is modified in place and also returned.
    """
    visible = history['visible']
    for idx, internal_entry in enumerate(history['internal']):
        user_message = visible[idx][0]
        visible[idx] = [user_message, internal_entry[1]]
    return history
def toggle_text_in_history(history):
    """Show or hide the message text under each audio player in the visible history.

    Only visible replies that start with ``<audio`` are touched. Depending on
    ``params['show_text']`` the internal reply is appended after the audio tag
    or everything after the tag is dropped. The history is modified in place
    and returned.
    """
    visible = history['visible']
    for idx, entry in enumerate(visible):
        shown = entry[1]
        if not shown.startswith('<audio'):
            continue

        # Keep everything up to the first closing tag and re-add the tag itself.
        audio_tag = f"{shown.split('</audio>')[0]}</audio>"
        if params['show_text']:
            text = history['internal'][idx][1]
            visible[idx] = [visible[idx][0], f"{audio_tag}\n\n{text}"]
        else:
            visible[idx] = [visible[idx][0], audio_tag]
    return history
def remove_surrounded_chars(string):
    """Strip asterisk-delimited spans from *string*.

    The pattern matches as few characters as possible between two asterisks,
    or between an asterisk and the end of the string, and removes the match
    including the asterisks themselves.
    """
    # Raw string: '\*' in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'\*[^\*]*?(\*|$)', '', string)
def state_modifier(state):
    """Turn off ``state['stream']`` while the extension is active; return *state*."""
    if params['activate']:
        state['stream'] = False
    return state
def input_modifier(string):
    """Return *string* unchanged; when active, also set the processing message."""
    if params['activate']:
        shared.processing_message = "*Is recording a voice message...*"
    return string
def history_modifier(history):
    """Drop the ``autoplay`` attribute from the last visible reply.

    Nothing is changed when the internal history is empty. The history is
    modified in place and returned.
    """
    if not len(history['internal']) > 0:
        return history

    last = history['visible'][-1]
    reply_without_autoplay = last[1].replace('controls autoplay>', 'controls>')
    history['visible'][-1] = [last[0], reply_without_autoplay]
    return history
def output_modifier(string):
    """Convert a reply into an ElevenLabs audio clip and return the HTML player.

    When the extension is inactive the reply is returned untouched. Otherwise
    the text is cleaned (asterisk-delimited spans, double quotes and newlines
    removed), sent to ``elevenlabs.generate`` with the voice settings stored in
    ``params``, and the resulting bytes are written to a numbered .mp3 file.
    On success the return value is an ``<audio>`` tag pointing at that file;
    on an ElevenLabs API error it is a short error message instead. In both
    cases the original reply is appended when ``params['show_text']`` is set.

    Side effects: writes the .mp3 file, increments ``wav_idx`` on success and
    resets ``shared.processing_message``.
    """
    global wav_idx

    if not params['activate']:
        return string

    original_string = string
    string = remove_surrounded_chars(string)
    string = string.replace('"', '')
    # The pasted source contained the mis-decoded character 'β' here, which
    # would strip Greek betas; the intended character is the curly quote.
    string = string.replace('“', '')
    string = string.replace('\n', ' ')
    string = string.strip()
    if string == '':
        string = 'empty reply, try regenerating'

    output_file = Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.mp3')
    print(f'Outputting audio to {str(output_file)}')
    try:
        # NOTE(review): params['selected_voice'] is filled from voice *names*
        # (see refresh_voices), yet it is passed as voice_id here - confirm the
        # API accepts it, or map the name to its id first.
        audio = elevenlabs.generate(
            text=html.unescape(string),
            voice=elevenlabs.Voice(
                voice_id=params['selected_voice'],
                settings=elevenlabs.VoiceSettings(
                    stability=params['stability'],
                    similarity_boost=params['similarity_boost'],
                    style=params['style'],
                    use_speaker_boost=params['use_speaker_boost'],
                ),
            ),
            model=params['model'],
        )

        # Make sure the output folder exists before writing the clip.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'wb') as file:
            file.write(audio)

        # The "file/" prefix is what lets the Gradio front end fetch a local
        # file; without it the player has nothing to load and cannot play.
        autoplay = 'autoplay' if params['autoplay'] else ''
        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
        wav_idx += 1
    except elevenlabs.api.error.UnauthenticatedRateLimitError:
        string = "🤖 ElevenLabs Unauthenticated Rate Limit Reached - Please create an API key to continue\n\n"
    except elevenlabs.api.error.RateLimitError:
        string = "🤖 ElevenLabs API Tier Limit Reached\n\n"
    except elevenlabs.api.error.APIError as err:
        string = f"🤖 ElevenLabs Error: {err}\n\n"

    if params['show_text']:
        string += f'\n\n{original_string}'

    shared.processing_message = "*Is typing...*"
    return string
def ui():
    """Build the extension's Gradio controls and wire their event handlers.

    On first use the voice list is fetched and ``params['selected_voice']`` is
    reset to the first voice when it is unset ('None') or no longer available.
    The controls cover activation, autoplay, text visibility, the voice
    settings sliders, voice and model selection, the API key, and a confirmed
    action that replaces the audio players in the history with plain text.
    """
    global voices
    if not voices:
        voices = refresh_voices()
        selected = params['selected_voice']
        if not voices:
            # Nothing to fall back to; avoid indexing an empty list.
            logger.error('No ElevenLabs voices were returned; check the API key and press Refresh.')
        elif selected == 'None':
            params['selected_voice'] = voices[0]
        elif selected not in voices:
            logger.error(f'Selected voice {selected} not available, switching to {voices[0]}')
            params['selected_voice'] = voices[0]

    # Gradio elements
    with gr.Row():
        activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
        autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
        show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')

    with gr.Row():
        # All three voice settings are limited to 0-1, the range the ElevenLabs
        # API documents for them (the similarity slider previously allowed 3).
        stability_slider = gr.Slider(minimum=0, maximum=1, step=0.1, value=params['stability'], label='Stability')
        similarity_boost_slider = gr.Slider(minimum=0, maximum=1, step=0.1, value=params['similarity_boost'], label='Similarity Boost')
        style_slider = gr.Slider(minimum=0, maximum=1, step=0.1, value=params['style'], label='Style')

    with gr.Row():
        voice = gr.Dropdown(value=params['selected_voice'], choices=voices, label='TTS Voice')
        refresh = gr.Button(value='Refresh')

    with gr.Row():
        if params['api_key']:
            api_key = gr.Textbox(value=params['api_key'], label='API Key')
            update_api_key(params['api_key'])
        else:
            api_key = gr.Textbox(placeholder="Enter your API key.", label='API Key')

    with gr.Row():
        model = gr.Dropdown(value=params['model'], choices=LANG_MODELS, label='Language model')

    with gr.Row():
        convert = gr.Button('Permanently replace audios with the message texts')
        convert_cancel = gr.Button('Cancel', visible=False)
        convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False)

    # Event functions for sliders
    stability_slider.change(lambda x: params.update({'stability': x}), stability_slider, None)
    similarity_boost_slider.change(lambda x: params.update({'similarity_boost': x}), similarity_boost_slider, None)
    style_slider.change(lambda x: params.update({'style': x}), style_slider, None)

    # Convert history with confirmation
    convert_arr = [convert_confirm, convert, convert_cancel]
    convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr)
    convert_confirm.click(
        lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then(
        remove_tts_from_history, gradio('history'), gradio('history')).then(
        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

    convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)

    # Toggle message text in history
    show_text.change(
        lambda x: params.update({"show_text": x}), show_text, None).then(
        toggle_text_in_history, gradio('history'), gradio('history')).then(
        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

    # Event functions to update the parameters in the backend
    activate.change(lambda x: params.update({'activate': x}), activate, None)
    voice.change(lambda x: params.update({'selected_voice': x}), voice, None)
    api_key.change(update_api_key, api_key, None)
    model.change(lambda x: params.update({'model': x}), model, None)
    refresh.click(refresh_voices_dd, [], voice)
    autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
Edit: The sliders work after all. I guess they just don't map exactly 1:1 to how the website sounds, probably due to the Elevenlabs API.
This issue has been closed due to inactivity for 6 weeks. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment.