ChatTTS-13.py
· 1.3 KiB · Python
Raw
###################################
# Advanced ChatTTS usage example.
#
# This snippet expects `chat` (an initialised ChatTTS chat object) and
# `texts` (the input passed to the sentence-level `chat.infer` call) to
# already exist, and `ChatTTS`, `torch` and `torchaudio` to be imported;
# none of them are defined here.

###################################
# Sample a speaker from Gaussian.

rand_spk = chat.sample_random_speaker()
print(rand_spk)  # save it for later timbre recovery

params_infer_code = ChatTTS.Chat.InferCodeParams(
    spk_emb=rand_spk,  # add sampled speaker
    temperature=0.3,  # using custom temperature
    top_P=0.7,  # top P decode
    top_K=20,  # top K decode
)

###################################
# For sentence level manual control.

# use oral_(0-9), laugh_(0-2), break_(0-7)
# to generate special token in text to synthesize.
params_refine_text = ChatTTS.Chat.RefineTextParams(
    prompt='[oral_2][laugh_0][break_6]',
)

wavs = chat.infer(
    texts,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)

###################################
# For word level manual control.

# Control tokens are written directly into the text, so the refine-text
# stage is skipped via skip_refine_text=True.
text = 'What is [uv_break]your favorite english food?[laugh][lbreak]'
wavs = chat.infer(
    text,
    skip_refine_text=True,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)

# Save the first generated waveform, passing 24000 as the sample rate.
# Depending on the installed versions, the tensor must be saved either with
# an extra leading dimension (first call) or as-is (fallback call).
# NOTE(review): presumably this is a 1-D vs. 2-D (channels, samples) shape
# difference in wavs[0] -- confirm against the versions in use.
waveform = torch.from_numpy(wavs[0])
try:
    torchaudio.save("word_level_output.wav", waveform.unsqueeze(0), 24000)
except Exception:
    # `except Exception` rather than a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
    torchaudio.save("word_level_output.wav", waveform, 24000)
1 | ################################### |
2 | # Sample a speaker from Gaussian. |
3 | |
4 | rand_spk = chat.sample_random_speaker() |
5 | print(rand_spk) # save it for later timbre recovery |
6 | |
7 | params_infer_code = ChatTTS.Chat.InferCodeParams( |
8 | spk_emb = rand_spk, # add sampled speaker |
9 | temperature = .3, # using custom temperature |
10 | top_P = 0.7, # top P decode |
11 | top_K = 20, # top K decode |
12 | ) |
13 | |
14 | ################################### |
15 | # For sentence level manual control. |
16 | |
17 | # use oral_(0-9), laugh_(0-2), break_(0-7) |
18 | # to generate special token in text to synthesize. |
19 | params_refine_text = ChatTTS.Chat.RefineTextParams( |
20 | prompt='[oral_2][laugh_0][break_6]', |
21 | ) |
22 | |
23 | wavs = chat.infer( |
24 | texts, |
25 | params_refine_text=params_refine_text, |
26 | params_infer_code=params_infer_code, |
27 | ) |
28 | |
29 | ################################### |
30 | # For word level manual control. |
31 | |
32 | text = 'What is [uv_break]your favorite english food?[laugh][lbreak]' |
33 | wavs = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text, params_infer_code=params_infer_code) |
34 | """ |
35 | In some versions of torchaudio the first call works; in other versions only the second call does. |
36 | """ |
37 | try: |
38 | torchaudio.save("word_level_output.wav", torch.from_numpy(wavs[0]).unsqueeze(0), 24000) |
39 | except: |
40 | torchaudio.save("word_level_output.wav", torch.from_numpy(wavs[0]), 24000) |