oobabooga / text-generation-webui-extensions

Enhancing Elevenlabs-tts with Improved V2 Voices Integration

brentjohnston opened this issue · comments

I've decided to post the code I've been working on. It's for the much more realistic-sounding V2 versions of the Elevenlabs voices. I noticed the elevenlabs-tts extension seems to support only the V1 models, and you can't adjust the style setting for the V2 voices with it.

I had a chat with an Elevenlabs mod on their Discord, and he confirmed that the old extension doesn't pass the right settings to the API call for V2 voices/sliders. This was a big help in figuring out what needed to be fixed.
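
For reference, this is roughly what a V2-capable request looks like against the plain REST endpoint (a minimal sketch, not part of the extension below; the API key, voice ID, and output filename are placeholders). The important bit is that model_id is set to a v2 model and that style and use_speaker_boost are included in voice_settings, which the old extension never sends:

import requests

XI_API_KEY = "your-api-key"   # placeholder
VOICE_ID = "your-voice-id"    # placeholder

response = requests.post(
    f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}",
    headers={"xi-api-key": XI_API_KEY},
    json={
        "text": "Hello from the multilingual v2 model.",
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 0.7,
            "similarity_boost": 0.5,
            "style": 0.5,               # only the v2 models honor this
            "use_speaker_boost": True,
        },
    },
)
response.raise_for_status()
with open("test.mp3", "wb") as f:      # write the returned mp3 bytes
    f.write(response.content)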

The script works well, but there are some bugs I hope someone can help with.

This updated code somehow stops the play button in the chat player from working, and I don't know why, even though it exports the .mp3 files to /extensions/elevenlabs_tts/outputs correctly. The sliders also change the voice way too much compared to the same values on the website; the Elevenlabs devs say the values are being passed correctly, so I have no idea.

If anyone can help out or thinks this could be a great fork, please go ahead.

To use it, replace the existing script at /extensions/elevenlabs_tts/script.py with the version below. The V2 voices do add a whole new level of immersion, especially with the adjustable voice style settings in the file.

Thanks in advance for any help, or for any interest in taking this project further!

import html
import re
from pathlib import Path

import elevenlabs
import gradio as gr

from modules import chat, shared, ui_chat
from modules.logging_colors import logger
from modules.utils import gradio

params = {
    'activate': True,
    'api_key': None,
    'selected_voice': 'None',
    'autoplay': False,
    'show_text': True,
    'model': 'eleven_multilingual_v2',
    'stability': 0.7,  # Default value for stability
    'similarity_boost': 0.5,  # Default value for similarity boost
    'style': 0.5,  # Default style
    'use_speaker_boost': True,  # Default for use speaker boost
}

voices = None
wav_idx = 0
LANG_MODELS = ['eleven_multilingual_v2']

def update_api_key(key):
    params['api_key'] = key
    if key is not None:
        elevenlabs.set_api_key(key)

def refresh_voices():
    global params
    your_voices = elevenlabs.voices()
    voice_names = [voice.name for voice in your_voices]
    return voice_names

def refresh_voices_dd():
    all_voices = refresh_voices()
    return gr.Dropdown.update(value=all_voices[0], choices=all_voices)

def remove_tts_from_history(history):
    for i, entry in enumerate(history['internal']):
        history['visible'][i] = [history['visible'][i][0], entry[1]]

    return history

def toggle_text_in_history(history):
    for i, entry in enumerate(history['visible']):
        visible_reply = entry[1]
        if visible_reply.startswith('<audio'):
            if params['show_text']:
                reply = history['internal'][i][1]
                history['visible'][i] = [history['visible'][i][0], f"{visible_reply.split('</audio>')[0]}</audio>\n\n{reply}"]
            else:
                history['visible'][i] = [history['visible'][i][0], f"{visible_reply.split('</audio>')[0]}</audio>"]

    return history

def remove_surrounded_chars(string):
    # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
    # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
    return re.sub(r'\*[^\*]*?(\*|$)', '', string)

def state_modifier(state):
    if not params['activate']:
        return state

    state['stream'] = False
    return state

def input_modifier(string):
    if not params['activate']:
        return string

    shared.processing_message = "*Is recording a voice message...*"
    return string

def history_modifier(history):
    # Remove autoplay from the last reply
    if len(history['internal']) > 0:
        history['visible'][-1] = [
            history['visible'][-1][0],
            history['visible'][-1][1].replace('controls autoplay>', 'controls>')
        ]

    return history
    
def output_modifier(string):
    global params, wav_idx

    if not params['activate']:
        return string

    original_string = string
    string = remove_surrounded_chars(string)
    string = string.replace('"', '')
    string = string.replace('“', '')
    string = string.replace('\n', ' ')
    string = string.strip()
    if string == '':
        string = 'empty reply, try regenerating'

    output_file = Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.mp3')
    print(f'Outputting audio to {str(output_file)}')
    try:
        audio = elevenlabs.generate(
            text=html.unescape(string),
            voice=elevenlabs.Voice(
                voice_id=params['selected_voice'],
                settings=elevenlabs.VoiceSettings(
                    stability=params['stability'],
                    similarity_boost=params['similarity_boost'],
                    style=params['style'],
                    use_speaker_boost=params['use_speaker_boost']
                )
            ),
            model=params['model']
        )

        # Save the audio file
        with open(output_file, 'wb') as file:
            file.write(audio)

        # Update the HTML string to reference the saved audio file
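        # NOTE: the stock elevenlabs_tts extension prefixes this path with "file/" (src="file/{...}")
        # so that Gradio can serve the mp3; its absence here might be why the chat play button
        # does nothing even though the files are written correctly.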
        string = f'<audio src="{output_file.as_posix()}" controls {"autoplay" if params["autoplay"] else ""}></audio>'
        wav_idx += 1
    except elevenlabs.api.error.UnauthenticatedRateLimitError:
        string = "πŸ€– ElevenLabs Unauthenticated Rate Limit Reached - Please create an API key to continue\n\n"
    except elevenlabs.api.error.RateLimitError:
        string = "πŸ€– ElevenLabs API Tier Limit Reached\n\n"
    except elevenlabs.api.error.APIError as err:
        string = f"πŸ€– ElevenLabs Error: {err}\n\n"

    if params['show_text']:
        string += f'\n\n{original_string}'

    shared.processing_message = "*Is typing...*"
    return string

def ui():
    global voices
    if not voices:
        voices = refresh_voices()
        selected = params['selected_voice']
        if selected == 'None':
            params['selected_voice'] = voices[0]
        elif selected not in voices:
            logger.error(f'Selected voice {selected} not available, switching to {voices[0]}')
            params['selected_voice'] = voices[0]

    # Gradio elements
    with gr.Row():
        activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
        autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
        show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
        
    with gr.Row():
        # Updated sliders with correct default values from params
        stability_slider = gr.Slider(minimum=0, maximum=1, step=0.1, value=params['stability'], label='Stability')
        similarity_boost_slider = gr.Slider(minimum=0, maximum=1, step=0.1, value=params['similarity_boost'], label='Similarity Boost')  # the API expects similarity_boost in the 0-1 range
        style_slider = gr.Slider(minimum=0, maximum=1, step=0.1, value=params['style'], label='Style')

    with gr.Row():
        voice = gr.Dropdown(value=params['selected_voice'], choices=voices, label='TTS Voice')
        refresh = gr.Button(value='Refresh')

    with gr.Row():
        if params['api_key']:
            api_key = gr.Textbox(value=params['api_key'], label='API Key')
            update_api_key(params['api_key'])
        else:
            api_key = gr.Textbox(placeholder="Enter your API key.", label='API Key')

    with gr.Row():
        model = gr.Dropdown(value=params['model'], choices=LANG_MODELS, label='Language model')

    with gr.Row():
        convert = gr.Button('Permanently replace audios with the message texts')
        convert_cancel = gr.Button('Cancel', visible=False)
        convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False)

        # Event functions for sliders
        stability_slider.change(lambda x: params.update({'stability': x}), stability_slider, None)
        similarity_boost_slider.change(lambda x: params.update({'similarity_boost': x}), similarity_boost_slider, None)
        style_slider.change(lambda x: params.update({'style': x}), style_slider, None)

        # Convert history with confirmation
        convert_arr = [convert_confirm, convert, convert_cancel]
        convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr)
        convert_confirm.click(
            lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then(
            remove_tts_from_history, gradio('history'), gradio('history')).then(
            chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
            chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

        convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)

        # Toggle message text in history
        show_text.change(
            lambda x: params.update({"show_text": x}), show_text, None).then(
            toggle_text_in_history, gradio('history'), gradio('history')).then(
            chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
            chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

        # Event functions to update the parameters in the backend
        activate.change(lambda x: params.update({'activate': x}), activate, None)
        voice.change(lambda x: params.update({'selected_voice': x}), voice, None)
        api_key.change(update_api_key, api_key, None)
        model.change(lambda x: params.update({'model': x}), model, None)
        # connect.click(check_valid_api, [], connection_status)
        refresh.click(refresh_voices_dd, [], voice)
        autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)

Edit: The sliders work after all; I guess they just don't map exactly 1:1 to how the website sounds, probably because of how the Elevenlabs API interprets the values.

This issue has been closed due to inactivity for 6 weeks. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment.