# While using text-to-speech services, can I also get a JSON which maps each word to its timestamp in the generated audio?
1 messages · Page 1 of 1 (latest)
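Yes. Our Timestamps API endpoint for Text-to-Speech generation returns a JSON that maps each character to its timestamp in the generated audio; word-level timings can be derived from it by grouping the consecutive characters between spaces. Here is an example script that retrieves the character-level timestamps: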
import requests
import json
import base64
# Example: generate speech with the with-timestamps Text-to-Speech endpoint,
# save the audio to output.mp3 and print the character-level alignment.

VOICE_ID = "21m00Tcm4TlvDq8ikWAM"  # Rachel
YOUR_XI_API_KEY = "ENTER_YOUR_API_KEY_HERE"

# requests has no default timeout; without one the call can block forever
# if the server stops responding.
REQUEST_TIMEOUT_SECONDS = 60

url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/with-timestamps"

headers = {
    "Content-Type": "application/json",
    "xi-api-key": YOUR_XI_API_KEY,
}

data = {
    "text": (
        "Born and raised in the charming south, "
        "I can add a touch of sweet southern hospitality "
        "to your audiobooks and podcasts"
    ),
    "model_id": "eleven_multilingual_v2",
    "voice_settings": {
        "stability": 0.5,
        "similarity_boost": 0.75,
    },
}

response = requests.post(
    url,
    json=data,
    headers=headers,
    timeout=REQUEST_TIMEOUT_SECONDS,
)

if response.status_code != 200:
    # Raising SystemExit with a message writes it to stderr and exits with
    # status 1, so callers and shell scripts can detect the failure
    # (quit() is an interactive-shell helper and exits with status 0).
    raise SystemExit(
        f"Error encountered, status: {response.status_code}, content: {response.text}"
    )

# Convert the response which contains bytes into a JSON string from utf-8 encoding
json_string = response.content.decode("utf-8")

# Parse the JSON string and load the data as a dictionary
response_dict = json.loads(json_string)

# The "audio_base64" entry in the dictionary contains the audio as a base64 encoded string,
# we need to decode it into bytes in order to save the audio as a file
audio_bytes = base64.b64decode(response_dict["audio_base64"])

with open('output.mp3', 'wb') as f:
    f.write(audio_bytes)

# The 'alignment' entry contains the mapping between input characters and their timestamps
print(response_dict['alignment'])
The output will be a dictionary containing three lists: characters, character_start_times_seconds, and character_end_times_seconds. Each character in the text will have corresponding start and end timestamps.
While using text-to-speech services, can I also get a JSON which maps each word to its start and end timestamps in the generated audio?
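Yes. Our Timestamps API endpoint for Text-to-Speech generation returns a JSON that maps each character to its start and end timestamps in the generated audio; a word's start and end times can be derived from it by taking the start time of the word's first character and the end time of its last character. Here is an example script that retrieves the character-level timestamps: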
import requests
import json
import base64
# Example: request speech plus character start/end timestamps from the
# with-timestamps Text-to-Speech endpoint, write the audio to output.mp3
# and print the alignment data.

VOICE_ID = "21m00Tcm4TlvDq8ikWAM"  # Rachel
YOUR_XI_API_KEY = "ENTER_YOUR_API_KEY_HERE"

# Upper bound (in seconds) on how long to wait for the API; requests would
# otherwise wait indefinitely on an unresponsive connection.
REQUEST_TIMEOUT_SECONDS = 60

url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/with-timestamps"

headers = {
    "Content-Type": "application/json",
    "xi-api-key": YOUR_XI_API_KEY,
}

data = {
    "text": (
        "Born and raised in the charming south, "
        "I can add a touch of sweet southern hospitality "
        "to your audiobooks and podcasts"
    ),
    "model_id": "eleven_multilingual_v2",
    "voice_settings": {
        "stability": 0.5,
        "similarity_boost": 0.75,
    },
}

response = requests.post(
    url,
    json=data,
    headers=headers,
    timeout=REQUEST_TIMEOUT_SECONDS,
)

if response.status_code != 200:
    # Exit with a non-zero status and report the error on stderr; quit() is
    # intended for the interactive interpreter and would exit with status 0.
    raise SystemExit(
        f"Error encountered, status: {response.status_code}, content: {response.text}"
    )

# Convert the response which contains bytes into a JSON string from utf-8 encoding
json_string = response.content.decode("utf-8")

# Parse the JSON string and load the data as a dictionary
response_dict = json.loads(json_string)

# The "audio_base64" entry in the dictionary contains the audio as a base64 encoded string,
# we need to decode it into bytes in order to save the audio as a file
audio_bytes = base64.b64decode(response_dict["audio_base64"])

with open('output.mp3', 'wb') as f:
    f.write(audio_bytes)

# The 'alignment' entry contains the mapping between input characters and their timestamps
print(response_dict['alignment'])
The output will be a dictionary containing three lists: characters, character_start_times_seconds, and character_end_times_seconds. Each character in the text will have corresponding start and end timestamps.
Great, this helps a lot!