Resolution for silero_tts not supporting long texts of more than 1000 tokens
Netmees opened this issue
Describe the bug
The extension does not support long texts of more than 1000 tokens.
Is there an existing issue for this?
- I have searched the existing issues
Reproduction
Ask for something large; any reply longer than roughly 1000 tokens triggers the error below.
Screenshot
No response
Logs
model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
File "<torch_package_0>.multi_acc_v3_package.py", line 366, in save_wav
audio = self.apply_tts(text=text,
^^^^^^^^^^^^^^^^^^^^^^^^^
File "<torch_package_0>.multi_acc_v3_package.py", line 340, in apply_tts
raise Exception("Model couldn't generate your text, probably it's too long")
Exception: Model couldn't generate your text, probably it's too long
System Info
Intel Xeon (CPU inference)
Solution: improve the code with the help of several AI tools. The v3 package raises the exception above whenever its input exceeds an internal length limit, so the fix is to split long replies into chunks of at most 1000 characters (cutting at sentence boundaries, with the character count serving as a safe proxy for the token limit), synthesize each chunk separately, and concatenate the audio before writing the WAV file. The full modified script.py:
import html
import json
import random
import time
from pathlib import Path
import gradio as gr
import torch
import numpy as np
import soundfile as sf
from extensions.silero_tts import tts_preprocessor
from modules import chat, shared, ui_chat
from modules.utils import gradio
torch._C._jit_set_profiling_mode(False)

params = {
    'activate': True,
    'speaker': 'en_56',
    'language': 'English',
    'model_id': 'v3_en',
    'sample_rate': 48000,
    'device': 'cpu',
    'show_text': False,
    'autoplay': True,
    'voice_pitch': 'medium',
    'voice_speed': 'medium',
    'local_cache_path': ''  # User can override the default cache path via settings.json
}
current_params = params.copy()

with open(Path("extensions/silero_tts/languages.json"), encoding='utf8') as f:
    languages = json.load(f)
voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']

# Used for making text XML-compatible, needed for voice pitch and speed control
table = str.maketrans({
    "<": "&lt;",
    ">": "&gt;",
    "&": "&amp;",
    "'": "&apos;",
    '"': "&quot;",
})


def xmlesc(txt):
    return txt.translate(table)
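
# Example (a quick illustrative check, not part of the original extension):
#   >>> xmlesc('5 < 6 & "quotes"')
#   '5 &lt; 6 &amp; &quot;quotes&quot;'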

def load_model():
    torch_cache_path = torch.hub.get_dir() if params['local_cache_path'] == '' else params['local_cache_path']
    model_path = torch_cache_path + "/snakers4_silero-models_master/src/silero/model/" + params['model_id'] + ".pt"
    if Path(model_path).is_file():
        print(f'\nUsing Silero TTS cached checkpoint found at {torch_cache_path}')
        model, example_text = torch.hub.load(repo_or_dir=torch_cache_path + '/snakers4_silero-models_master/', model='silero_tts', language=languages[params['language']]["lang_id"], speaker=params['model_id'], source='local', path=model_path, force_reload=True)
    else:
        print(f'\nSilero TTS cache not found at {torch_cache_path}. Attempting to download...')
        model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=languages[params['language']]["lang_id"], speaker=params['model_id'])

    model.to(params['device'])
    return model

def remove_tts_from_history(history):
    for i, entry in enumerate(history['internal']):
        history['visible'][i] = [history['visible'][i][0], entry[1]]
    return history

def toggle_text_in_history(history):
    for i, entry in enumerate(history['visible']):
        visible_reply = entry[1]
        if visible_reply.startswith('<audio'):
            if params['show_text']:
                reply = history['internal'][i][1]
                history['visible'][i] = [history['visible'][i][0], f"{visible_reply.split('</audio>')[0]}</audio>\n\n{reply}"]
            else:
                history['visible'][i] = [history['visible'][i][0], f"{visible_reply.split('</audio>')[0]}</audio>"]
    return history

def state_modifier(state):
    if not params['activate']:
        return state

    state['stream'] = False
    return state

def input_modifier(string, state):
    if not params['activate']:
        return string

    shared.processing_message = "*Is recording a voice message...*"
    return string

def history_modifier(history):
    # Remove autoplay from the last reply
    if len(history['internal']) > 0:
        history['visible'][-1] = [
            history['visible'][-1][0],
            history['visible'][-1][1].replace('controls autoplay>', 'controls>')
        ]
    return history

# The original output_modifier, kept for reference. It modifies the output
# string based on the current parameters and state, but passes the whole
# SSML document to model.save_wav in a single call, which raises the
# "too long" exception on long replies:
#
# def output_modifier(string, state):
#     global model, current_params, streaming_state
#
#     # Check if parameters have changed and load the model if necessary
#     for i in params:
#         if params[i] != current_params[i]:
#             model = load_model()
#             current_params = params.copy()
#             break
#
#     # If activation is disabled, return the original string
#     if not params['activate']:
#         return string
#
#     # Preprocess the string and handle empty replies
#     original_string = string
#     string = tts_preprocessor.preprocess(html.unescape(string))
#     if string == '':
#         string = '*Empty reply, try regenerating*'
#     else:
#         # Generate the audio file and create the HTML audio element
#         output_file = Path(f'extensions/silero_tts/outputs/{state["character_menu"]}_{int(time.time())}.wav')
#         prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])
#         silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
#         model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
#
#         autoplay = 'autoplay' if params['autoplay'] else ''
#         string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
#         if params['show_text']:
#             string += f'\n\n{original_string}'
#
#     shared.processing_message = "*Is typing...*"
#     return string

def chunk_text(text, max_length=1000):
    """
    Chunks a long text into smaller pieces for processing.

    Args:
        text: The input text to be chunked.
        max_length: The maximum length of each chunk, in characters.

    Returns:
        A list of chunks.
    """
    chunks = []
    while len(text) > max_length:
        # Prefer to cut at the last sentence boundary inside the window;
        # fall back to a hard cut when no period is found
        chunk = text[:max_length]
        last_period = chunk.rfind('.')
        if last_period != -1:
            chunk = text[:last_period + 1]
            text = text[last_period + 1:]
        else:
            text = text[max_length:]
        chunks.append(chunk)
    chunks.append(text)
    return chunks
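
# Example behaviour (a hypothetical REPL check, with the default
# max_length=1000): a ~2,200-character reply made of short sentences comes
# back as three chunks, and every cut lands on a sentence-ending period:
#   >>> parts = chunk_text('All work and no play makes Jack a dull boy. ' * 50)
#   >>> len(parts), all(p.rstrip().endswith('.') for p in parts[:-1])
#   (3, True)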

def apply_tts(model, ssml_text, **kwargs):
    """
    Applies text-to-speech to a single SSML fragment.

    Args:
        model: The loaded Silero TTS model.
        ssml_text: The SSML fragment to be converted to speech.
        **kwargs: Additional keyword arguments passed to model.apply_tts.

    Returns:
        The generated audio data as a 1-D tensor.
    """
    # Note: an earlier draft called an undefined my_tts module here; the v3
    # model packages expose apply_tts(ssml_text=...) directly, which is the
    # same method save_wav calls in the traceback above
    return model.apply_tts(ssml_text=ssml_text, **kwargs)

def process_long_text(model, text, prosody='', **kwargs):
    """
    Processes a long text by chunking it and generating audio for each chunk.

    Args:
        model: The TTS model to use.
        text: The plain (not yet SSML-wrapped) input text to be processed.
        prosody: An optional opening <prosody ...> tag applied to every chunk.
        **kwargs: Additional keyword arguments passed to the TTS model.

    Returns:
        The concatenated audio for the entire text.
    """
    chunks = chunk_text(text)
    audio_chunks = []
    for chunk in chunks:
        # Wrap each chunk in its own <speak> block so the pitch/speed markup
        # is never split across a chunk boundary
        if prosody:
            ssml = f'<speak>{prosody}{xmlesc(chunk)}</prosody></speak>'
        else:
            ssml = f'<speak>{xmlesc(chunk)}</speak>'
        audio_chunks.append(apply_tts(model, ssml, **kwargs))
    # apply_tts returns torch tensors; np.concatenate converts and joins them
    return np.concatenate(audio_chunks)

def output_modifier(string, state):
    global model, current_params, streaming_state

    # Check if parameters have changed and load the model if necessary
    for i in params:
        if params[i] != current_params[i]:
            model = load_model()
            current_params = params.copy()
            break

    # If activation is disabled, return the original string
    if not params['activate']:
        return string

    # Preprocess the string and handle empty replies
    original_string = string
    string = tts_preprocessor.preprocess(html.unescape(string))
    if string == '':
        string = '*Empty reply, try regenerating*'
    else:
        # Generate the audio file and create the HTML audio element
        output_file = Path(f'extensions/silero_tts/outputs/{state["character_menu"]}_{int(time.time())}.wav')
        prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])

        # Chunk the plain text and let process_long_text wrap each chunk in
        # SSML, so long replies no longer hit the model's length limit
        audio = process_long_text(model, string, prosody=prosody, speaker=params['speaker'], sample_rate=int(params['sample_rate']))

        # Save the concatenated audio to a file
        sf.write(str(output_file), audio, int(params['sample_rate']))

        autoplay = 'autoplay' if params['autoplay'] else ''
        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
        if params['show_text']:
            string += f'\n\n{original_string}'

    shared.processing_message = "*Is typing...*"
    return string

def setup():
    global model
    model = load_model()


def random_sentence():
    with open(Path("extensions/silero_tts/harvard_sentences.txt")) as f:
        return random.choice(list(f))

def voice_preview(string):
    global model, current_params, streaming_state

    for i in params:
        if params[i] != current_params[i]:
            model = load_model()
            current_params = params.copy()
            break

    string = tts_preprocessor.preprocess(string or random_sentence())

    # Previews are short, so the unchunked save_wav path is fine here
    output_file = Path('extensions/silero_tts/outputs/voice_preview.wav')
    prosody = f"<prosody rate=\"{params['voice_speed']}\" pitch=\"{params['voice_pitch']}\">"
    silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
    model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))

    return f'<audio src="file/{output_file.as_posix()}?{int(time.time())}" controls autoplay></audio>'

def language_change(lang):
    global params
    params.update({"language": lang, "speaker": languages[lang]["default_voice"], "model_id": languages[lang]["model_id"]})
    return gr.update(choices=languages[lang]["voices"], value=languages[lang]["default_voice"])


def custom_css():
    path_to_css = Path(__file__).parent.resolve() / 'style.css'
    return open(path_to_css, 'r').read()

def ui():
    # Gradio elements
    with gr.Accordion("Silero TTS"):
        with gr.Row():
            activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
            autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
            show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')

        with gr.Row():
            language = gr.Dropdown(value=params['language'], choices=sorted(languages.keys()), label='Language')
            voice = gr.Dropdown(value=params['speaker'], choices=languages[params['language']]["voices"], label='TTS voice')

        with gr.Row():
            v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch')
            v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed')

        with gr.Row():
            preview_text = gr.Text(show_label=False, placeholder="Preview text", elem_id="silero_preview_text")
            preview_play = gr.Button("Preview")
            preview_audio = gr.HTML(visible=False)

        with gr.Row():
            convert = gr.Button('Permanently replace audios with the message texts')
            convert_cancel = gr.Button('Cancel', visible=False)
            convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False)

    # Convert history with confirmation
    convert_arr = [convert_confirm, convert, convert_cancel]
    convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr)
    convert_confirm.click(
        lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then(
        remove_tts_from_history, gradio('history'), gradio('history')).then(
        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))
    convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)

    # Toggle message text in history
    show_text.change(
        lambda x: params.update({"show_text": x}), show_text, None).then(
        toggle_text_in_history, gradio('history'), gradio('history')).then(
        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

    # Event functions to update the parameters in the backend
    activate.change(lambda x: params.update({"activate": x}), activate, None)
    autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
    language.change(language_change, language, voice, show_progress=False)
    voice.change(lambda x: params.update({"speaker": x}), voice, None)
    v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None)
    v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None)

    # Play preview
    preview_text.submit(voice_preview, preview_text, preview_audio)
    preview_play.click(voice_preview, preview_text, preview_audio)

Main modifications:

add:

import numpy as np
import soundfile as sf

add and substitute: the chunk_text, apply_tts, process_long_text, and output_modifier functions shown in the script above.
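
For a quick sanity check outside the web UI, the chunked path can be exercised directly. The snippet below is a minimal sketch, not part of the extension: it assumes the modified process_long_text above (including its prosody keyword) is importable, loads the stock v3 English model through torch.hub, and writes the result with soundfile:

import torch
import soundfile as sf

# Load the v3 English model straight from torch.hub (downloads on first use)
model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                     model='silero_tts', language='en',
                                     speaker='v3_en')
model.to('cpu')

# Roughly 2,800 characters, well past the length that used to raise
# "Model couldn't generate your text, probably it's too long"
long_text = 'This sentence repeats just to exceed the old limit. ' * 55

audio = process_long_text(model, long_text,
                          prosody='<prosody rate="medium" pitch="medium">',
                          speaker='en_56', sample_rate=48000)

sf.write('long_reply.wav', audio, 48000)  # one continuous file, no exception

Because chunk_text cuts at periods, the seams between chunks fall on sentence boundaries and should be barely audible in the concatenated audio.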