How do I fix the destination container URL (folder) in Azure Batch Transcription?
import os
import datetime
import json
import time
import requests
from azure.storage.blob import BlobServiceClient
# =======================
# Configuration Settings
# =======================
# SECURITY: keys, SAS tokens and connection strings should not live in source
# control. Each secret below can be supplied through an environment variable;
# the original literal is kept only as a backward-compatible default.

# Provided container URL for audio files (source).
# NOTE(review): the default SAS carries se=2025-02-10T21:42:24Z, which is
# earlier than the job's createdDateTime (2025-02-12T14:44:53Z) in the posted
# run log. An expired token is a likely reason for the service's "InvalidUri"
# error; regenerate the SAS before running.
CONTENT_CONTAINER_URL = os.environ.get(
    "TRANSCRIPTION_CONTENT_CONTAINER_URL",
    "https://czrndstorage.blob.core.windows.net/"
    "czrnd-container-transcription/?sp=rwl&st=2025-02-10T13:42:24Z&se=2025-02-10T21:42:24Z&spr=https&sv=2022-11-02&sr=c&sig=aYT0iJNVifAKeO6fJuL4FyqsdGF1nIl6wUrNMkwbOuc%3D",
)

# Provided destination container URL for transcription output.
# NOTE(review): this points at the same container as CONTENT_CONTAINER_URL, so
# results land next to the input audio. The service presumably expects a
# container-level SAS URL here (not a folder path); confirm against the
# batch transcription documentation.
DESTINATION_CONTAINER_URL = os.environ.get(
    "TRANSCRIPTION_DESTINATION_CONTAINER_URL",
    "https://czrndstorage.blob.core.windows.net/czrnd-container-transcription?sp=rcwl&st=2025-02-12T14:43:01Z&se=2025-02-12T22:43:01Z&spr=https&sv=2022-11-02&sr=c&sig=heNpXoXNuAcj5NxK1BHYos5AqN9rsiKZo4BSeuRiXdk%3D",
)

# Speech API configuration
SPEECH_API_KEY = os.environ.get(
    "AZURE_SPEECH_KEY",
    "P1e+CiSf+DnwNBGhoCiHyYEtRLIh+AStqUZtug==",
)
REGION = "centralindia"
TRANSCRIPTION_ENDPOINT = f"https://{REGION}.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions"

# Azure Blob Storage connection (you must supply your connection string)
BLOB_CONNECTION_STRING = os.environ.get(
    "AZURE_STORAGE_CONNECTION_STRING",
    "DefaultEndpointsProP1qcoVpIRkY58p33/XgGMOCFVoAX6wa91cxR4eor43YsZMc3bne+CiSf+DnwNBGhoCiHyYEtRLIh+AStqUZtug==;EndpointSuffix=core.windows.net",
)

# The name of the container where audio files are uploaded
# (extracted from the CONTENT_CONTAINER_URL)
CONTENT_CONTAINER_NAME = "czrnd-container-transcription"

# Local folder where your audio files are stored
LOCAL_AUDIO_FOLDER = "/content/audio"  # update this path
# =======================
# Helper Functions
# =======================
def create_date_folder():
    """Return today's local date as a folder name (e.g. '2025-02-11').

    Returns:
        The current local date formatted as ``YYYY-MM-DD``.
    """
    return datetime.datetime.now().strftime("%Y-%m-%d")
def upload_files_to_blob(date_folder):
    """Upload every .wav/.mp3 file in LOCAL_AUDIO_FOLDER to the blob container.

    Each file is stored as ``<date_folder>/<file_name>`` inside
    CONTENT_CONTAINER_NAME, overwriting any blob that already has that name.

    Args:
        date_folder: Prefix (virtual folder) placed in front of every blob name.
    """
    print("Connecting to Azure Blob Storage...")
    blob_service_client = BlobServiceClient.from_connection_string(BLOB_CONNECTION_STRING)
    container_client = blob_service_client.get_container_client(CONTENT_CONTAINER_NAME)

    # Creating the container is best-effort: it normally exists already.
    # Report the reason instead of swallowing it silently so that a genuine
    # problem (for example bad credentials) is visible before uploads start.
    try:
        container_client.create_container()
    except Exception as exc:
        print(f"Container not created ({type(exc).__name__}); continuing with uploads.")

    # Sorted so that the upload order is deterministic across runs.
    for file_name in sorted(os.listdir(LOCAL_AUDIO_FOLDER)):
        if not file_name.lower().endswith((".wav", ".mp3")):
            continue
        file_path = os.path.join(LOCAL_AUDIO_FOLDER, file_name)
        # A directory whose name happens to end in .wav/.mp3 cannot be opened.
        if not os.path.isfile(file_path):
            continue
        blob_name = f"{date_folder}/{file_name}"  # e.g., "2025-02-11/myfile.wav"
        blob_client = container_client.get_blob_client(blob_name)
        with open(file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)
        print(f"Uploaded '{file_name}' as '{blob_name}'")
def start_transcription():
    """Create a batch transcription job and return its ID.

    Posts CONTENT_CONTAINER_URL and DESTINATION_CONTAINER_URL to
    TRANSCRIPTION_ENDPOINT and reads the job ID from the last path segment of
    the ``self`` URL in the response.

    Returns:
        A ``(transcription_id, response_json)`` tuple, or ``(None, None)`` when
        the request could not be sent or the service did not answer 200/201.
        ``transcription_id`` is ``None`` if the response has no ``self`` URL.
    """
    headers = {
        # Use the shared configuration value rather than a per-function literal
        # so every request authenticates with the same key.
        "Ocp-Apim-Subscription-Key": SPEECH_API_KEY,
        "Content-Type": "application/json",
    }
    request_body = {
        "contentContainerUrl": CONTENT_CONTAINER_URL,
        "locale": "en-IN",
        "displayName": "My Transcription",
        "model": None,
        "properties": {
            "wordLevelTimestampsEnabled": True,
            "diarizationEnabled": True,
            "destinationContainerUrl": DESTINATION_CONTAINER_URL,
        },
    }

    print("Starting bulk transcription...")
    try:
        # A timeout keeps the script from hanging forever on a stalled connection.
        response = requests.post(
            TRANSCRIPTION_ENDPOINT, headers=headers, json=request_body, timeout=30
        )
    except requests.RequestException as exc:
        print("Error starting transcription:", exc)
        return None, None

    if response.status_code not in (200, 201):
        print("Error starting transcription:", response.text)
        return None, None

    data = response.json()
    # Extract transcription id from the 'self' URL, e.g.:
    # "https://centralindia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/ab13f0b4-2d94-4b23-9d1f-f7f1d3e54057"
    self_url = data.get("self", "")
    # rstrip guards against a trailing slash yielding an empty last segment.
    transcription_id = self_url.rstrip("/").rsplit("/", 1)[-1] if self_url else None
    transcription_id = transcription_id or None
    print("Transcription ID:", transcription_id)
    return transcription_id, data
def check_transcription_status(transcription_id, poll_interval=15, max_wait=None):
    """Poll a transcription job until it reports Succeeded or Failed.

    Args:
        transcription_id: ID of the job, appended to TRANSCRIPTION_ENDPOINT.
        poll_interval: Seconds to sleep between status requests.
        max_wait: Optional upper bound, in seconds, on the total polling time.
            ``None`` (the default) polls indefinitely, as before.

    Returns:
        The job's JSON document once its status is ``"Succeeded"``; ``None`` if
        the job failed, a status request failed, or ``max_wait`` elapsed.
    """
    # Use the shared configuration value rather than a per-function literal.
    headers = {"Ocp-Apim-Subscription-Key": SPEECH_API_KEY}
    status_endpoint = f"{TRANSCRIPTION_ENDPOINT}/{transcription_id}"
    deadline = None if max_wait is None else time.monotonic() + max_wait

    print("Checking transcription status...")
    while True:
        try:
            response = requests.get(status_endpoint, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print("Error checking status:", exc)
            return None
        if response.status_code != 200:
            print("Error checking status:", response.text)
            return None

        data = response.json()
        status = data.get("status", "")
        print("Current status:", status)
        if status == "Succeeded":
            return data
        if status == "Failed":
            print("Transcription failed. Details:", json.dumps(data, indent=2))
            return None

        if deadline is not None and time.monotonic() >= deadline:
            print("Timed out waiting for transcription to finish.")
            return None
        # Wait before polling again
        time.sleep(poll_interval)
def download_transcription(transcription_data):
    """Download every result file listed for a finished transcription job.

    Reads the file-listing URL from ``transcription_data["links"]["files"]``,
    walks the listing, and saves each file's ``contentUrl`` into the current
    working directory.

    Args:
        transcription_data: The job's JSON document, as returned by
            check_transcription_status.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    files_url = (transcription_data.get("links") or {}).get("files")
    if not files_url:
        print("Files URL not found in transcription data.")
        return

    # Use the shared configuration value rather than a per-function literal.
    headers = {"Ocp-Apim-Subscription-Key": SPEECH_API_KEY}
    print("Downloading transcription files from:", files_url)

    page_url = files_url
    while page_url:
        response = requests.get(page_url, headers=headers, timeout=30)
        if response.status_code != 200:
            print("Error downloading transcription files:", response.text)
            return

        payload = response.json()
        if isinstance(payload, dict):
            # The v3.x listing is a paginated object: entries live under
            # "values" and further pages are reached through "@nextLink".
            # Iterating the dict itself would yield its string keys.
            files_info = payload.get("values", [])
            page_url = payload.get("@nextLink")
        else:
            # Tolerate a bare JSON array, which the original code assumed.
            files_info = payload
            page_url = None

        for file_info in files_info:
            # Each file info should include a URL to download the file content
            file_url = (file_info.get("links") or {}).get("contentUrl")
            if not file_url:
                print("No contentUrl found for a transcription file.")
                continue

            raw_name = file_info.get("name") or "transcription.json"
            # The name comes from the service and may contain path separators;
            # flatten them so the file is always written to the current folder.
            file_name = raw_name.replace("/", "_").replace("\\", "_")

            file_resp = requests.get(file_url, timeout=60)
            if file_resp.status_code == 200:
                with open(file_name, "wb") as f:
                    f.write(file_resp.content)
                print(f"Downloaded transcription file: {file_name}")
            else:
                print("Error downloading file:", file_resp.text)
# =======================
# Main Program Flow
# =======================
def main():
    """Run the whole flow: upload audio, start the job, poll it, download results."""
    # Step 1: Create a date-wise folder name and upload local audio files to
    # that folder in the container.
    date_folder = create_date_folder()
    print("Using date folder:", date_folder)
    upload_files_to_blob(date_folder)

    # Step 2: Start the transcription job. The creation response itself is not
    # needed afterwards, only the job ID.
    transcription_id, _ = start_transcription()
    if not transcription_id:
        return

    # Step 3: Poll until transcription is complete.
    final_data = check_transcription_status(transcription_id)
    if final_data:
        print("Transcription succeeded. Downloading transcription files...")
        download_transcription(final_data)
    else:
        print("Transcription did not succeed or encountered an error.")
# Run the flow only when executed as a script, not when imported.
# The dunder names are required: a bare `name` is undefined (NameError).
if __name__ == "__main__":
    main()
We are using the Python code above for batch transcription, and it repeatedly fails with the error shown below.
Azure community, how can we resolve this issue?
Using date folder: 2025-02-12 Connecting to Azure Blob Storage... Uploaded 'file1.wav' as '2025-02-12/file1.wav' Uploaded 'file3.wav' as '2025-02-12/file3.wav' Uploaded 'file2.mp3' as '2025-02-12/file2.mp3' Uploaded 'file4.mp3' as '2025-02-12/file4.mp3' Uploaded 'file4.wav' as '2025-02-12/file4.wav' Uploaded 'file3.mp3' as '2025-02-12/file3.mp3' Uploaded 'file5.wav' as '2025-02-12/file5.wav' Uploaded 'file2.wav' as '2025-02-12/file2.wav' Uploaded 'file5.mp3' as '2025-02-12/file5.mp3' Uploaded 'file1.mp3' as '2025-02-12/file1.mp3' Starting bulk transcription... Transcription ID: 52acd530-a1bb-4e4d-8051-4a3d236eece0 Checking transcription status... Current status: Running Current status: Failed Transcription failed. Details: { "self": "https://centralindia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/52acd530-a1bb-4e4d-8051-4a3d236eece0", "displayName": "My Transcription", "locale": "en-IN", "createdDateTime": "2025-02-12T14:44:53Z", "lastActionDateTime": "2025-02-12T14:44:57Z", "status": "Failed", "model": { "self": "https://centralindia.api.cognitive.microsoft.com/speechtotext/v3.2/models/base/7b56a83a-3ccd-4dbb-9574-e96e3bef805c" }, "links": { "files": "https://centralindia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/52acd530-a1bb-4e4d-8051-4a3d236eece0/files" }, "properties": { "diarizationEnabled": true, "wordLevelTimestampsEnabled": true, "displayFormWordLevelTimestampsEnabled": false, "channels": [ 0, 1 ], "punctuationMode": "DictatedAndAutomatic", "profanityFilterMode": "Masked", "destinationContainerUrl": "https://czrndstorage.blob.core.windows.net/czrnd-container-transcription?sp=rcwl&st=2025-02-12T14:43:01Z&se=2025-02-12T22:43:01Z&spr=https&sv=2022-11-02&sr=c&sig=heNpXoXNuAcj5NxK1BHYos5AqN9rsiKZo4BSeuRiXdk%3D",
"error": { "code": "InvalidUri", "message": "The provided container URI is invalid." } } } Transcription did not succeed or encountered an error.