How to fix destination container url folder in Azure Batch Transcription?

Bhikubhai 0 Reputation points
2025-02-12T16:36:16.2733333+00:00

import os

import datetime

import json

import time

import requests

from azure.storage.blob import BlobServiceClient

# =======================
# Configuration Settings
# =======================
#
# NOTE(review): every credential below is hard-coded. Load them from
# environment variables or a secret store instead, and rotate any key or SAS
# token that has been shared outside the team.

# Container SAS URL the Speech service reads the audio files from (source).
# NOTE(review): this token expires at se=2025-02-10T21:42:24Z, which is before
# the destination token below even becomes valid (st=2025-02-12T14:43:01Z).
# An expired source SAS may be what the service rejects -- regenerate and
# confirm.
# NOTE(review): unlike DESTINATION_CONTAINER_URL, this URL has a "/" between
# the container name and the "?"; confirm the service accepts that form.
CONTENT_CONTAINER_URL = (
    "https://czrndstorage.blob.core.windows.net/"
    "czrnd-container-transcription/?sp=rwl&st=2025-02-10T13:42:24Z&se=2025-02-10T21:42:24Z&spr=https&sv=2022-11-02&sr=c&sig=aYT0iJNVifAKeO6fJuL4FyqsdGF1nIl6wUrNMkwbOuc%3D"
)

# Container SAS URL the Speech service writes the transcription output to.
DESTINATION_CONTAINER_URL = (
    "https://czrndstorage.blob.core.windows.net/czrnd-container-transcription?sp=rcwl&st=2025-02-12T14:43:01Z&se=2025-02-12T22:43:01Z&spr=https&sv=2022-11-02&sr=c&sig=heNpXoXNuAcj5NxK1BHYos5AqN9rsiKZo4BSeuRiXdk%3D"
)

# Speech API configuration.
SPEECH_API_KEY = "P1e+CiSf+DnwNBGhoCiHyYEtRLIh+AStqUZtug=="
REGION = "centralindia"
TRANSCRIPTION_ENDPOINT = f"https://{REGION}.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions"

# Azure Blob Storage connection (you must supply your connection string).
BLOB_CONNECTION_STRING = "DefaultEndpointsProP1qcoVpIRkY58p33/XgGMOCFVoAX6wa91cxR4eor43YsZMc3bne+CiSf+DnwNBGhoCiHyYEtRLIh+AStqUZtug==;EndpointSuffix=core.windows.net"

# Name of the container the audio files are uploaded to; must match the
# container named in CONTENT_CONTAINER_URL.
CONTENT_CONTAINER_NAME = "czrnd-container-transcription"

# Local folder where your audio files are stored.
LOCAL_AUDIO_FOLDER = "/content/audio"  # update this path

# =======================
# Helper Functions
# =======================

def create_date_folder() -> str:
    """Return the current local date as a folder name (e.g. '2025-02-11')."""
    return datetime.datetime.now().strftime("%Y-%m-%d")

def upload_files_to_blob(date_folder):
    """Upload every .wav/.mp3 file in LOCAL_AUDIO_FOLDER to the blob container.

    Each file is stored as "<date_folder>/<file name>" in the container named
    by CONTENT_CONTAINER_NAME, overwriting any blob with the same name.

    Args:
        date_folder: Prefix (virtual folder) to place the uploaded blobs under.
    """
    print("Connecting to Azure Blob Storage...")
    blob_service_client = BlobServiceClient.from_connection_string(BLOB_CONNECTION_STRING)
    container_client = blob_service_client.get_container_client(CONTENT_CONTAINER_NAME)

    # Best-effort creation: the container normally exists already, so a
    # failure here is reported rather than raised. If the failure was a real
    # problem (bad credentials, network), the first upload below surfaces it.
    try:
        container_client.create_container()
    except Exception as exc:
        print(f"Container was not created (it probably exists already): {exc}")

    for file_name in os.listdir(LOCAL_AUDIO_FOLDER):
        if not file_name.lower().endswith((".wav", ".mp3")):
            continue

        file_path = os.path.join(LOCAL_AUDIO_FOLDER, file_name)
        # A directory whose name ends in .wav/.mp3 cannot be opened as a file.
        if not os.path.isfile(file_path):
            continue

        blob_name = f"{date_folder}/{file_name}"  # e.g., "2025-02-11/myfile.wav"
        blob_client = container_client.get_blob_client(blob_name)
        with open(file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)
        print(f"Uploaded '{file_name}' as '{blob_name}'")

def start_transcription():
    """Create a batch transcription job and return its ID and response body.

    Posts a request to TRANSCRIPTION_ENDPOINT that points the service at
    CONTENT_CONTAINER_URL for input and DESTINATION_CONTAINER_URL for output.

    SPEECH_API_KEY must hold the key of the Speech resource in REGION.

    Returns:
        A tuple (transcription_id, response_json). Both are None when the
        request fails or the service answers with a status other than 200/201.
        transcription_id alone is None when the response has no "self" URL.
    """
    headers = {
        # Use the configured key rather than an inline copy so there is a
        # single place to update it.
        "Ocp-Apim-Subscription-Key": SPEECH_API_KEY,
        "Content-Type": "application/json",
    }
    request_body = {
        "contentContainerUrl": CONTENT_CONTAINER_URL,
        "locale": "en-IN",
        "displayName": "My Transcription",
        "model": None,
        "properties": {
            "wordLevelTimestampsEnabled": True,
            "diarizationEnabled": True,
            "destinationContainerUrl": DESTINATION_CONTAINER_URL,
        },
    }

    print("Starting bulk transcription...")
    try:
        # Without a timeout, requests can block forever on a stalled connection.
        response = requests.post(
            TRANSCRIPTION_ENDPOINT, headers=headers, json=request_body, timeout=30
        )
    except requests.RequestException as exc:
        print("Error starting transcription:", exc)
        return None, None

    if response.status_code not in (200, 201):
        print("Error starting transcription:", response.text)
        return None, None

    data = response.json()

    # The transcription ID is the last path segment of the 'self' URL, e.g.:
    # "https://centralindia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/ab13f0b4-2d94-4b23-9d1f-f7f1d3e54057"
    # rstrip guards against a trailing slash producing an empty ID.
    self_url = data.get("self", "")
    transcription_id = self_url.rstrip("/").split("/")[-1] if self_url else None
    print("Transcription ID:", transcription_id)
    return transcription_id, data

def check_transcription_status(transcription_id, poll_interval=15):
    """Poll a transcription job until it reports Succeeded or Failed.

    SPEECH_API_KEY must hold the key of the Speech resource in REGION.

    Args:
        transcription_id: ID of the job, as returned by start_transcription().
        poll_interval: Seconds to wait between two status requests.

    Returns:
        The job's JSON document once its status is "Succeeded"; None when the
        job fails, a status request errors out, or the service answers with a
        status code other than 200.
    """
    # Use the configured key rather than an inline copy so there is a single
    # place to update it.
    headers = {"Ocp-Apim-Subscription-Key": SPEECH_API_KEY}
    status_endpoint = f"{TRANSCRIPTION_ENDPOINT}/{transcription_id}"

    print("Checking transcription status...")
    while True:
        try:
            # Without a timeout, one stalled request would hang the loop forever.
            response = requests.get(status_endpoint, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print("Error checking status:", exc)
            return None

        if response.status_code != 200:
            print("Error checking status:", response.text)
            return None

        data = response.json()
        status = data.get("status", "")
        print("Current status:", status)

        if status == "Succeeded":
            return data
        if status == "Failed":
            print("Transcription failed. Details:", json.dumps(data, indent=2))
            return None

        # Any other status means the job is still in progress: wait, then poll again.
        time.sleep(poll_interval)

def _save_transcription_file(file_info):
    """Download one entry of the files listing into the current directory."""
    file_url = file_info.get("links", {}).get("contentUrl")
    if not file_url:
        print("No contentUrl found for a transcription file.")
        return

    # The name is supplied by the service; keep only its last path component so
    # the file always lands in the current directory.
    file_name = os.path.basename(file_info.get("name") or "") or "transcription.json"

    try:
        file_resp = requests.get(file_url, timeout=60)
    except requests.RequestException as exc:
        print("Error downloading file:", exc)
        return

    if file_resp.status_code != 200:
        print("Error downloading file:", file_resp.text)
        return

    with open(file_name, "wb") as f:
        f.write(file_resp.content)
    print(f"Downloaded transcription file: {file_name}")


def download_transcription(transcription_data):
    """Download every result file of a finished transcription job.

    Reads the files listing from transcription_data["links"]["files"] and
    saves each listed file into the current working directory.

    SPEECH_API_KEY must hold the key of the Speech resource in REGION.

    Args:
        transcription_data: JSON document of the job, as returned by
            check_transcription_status().
    """
    files_url = transcription_data.get("links", {}).get("files")
    if not files_url:
        print("Files URL not found in transcription data.")
        return

    # Use the configured key rather than an inline copy so there is a single
    # place to update it.
    headers = {"Ocp-Apim-Subscription-Key": SPEECH_API_KEY}
    print("Downloading transcription files from:", files_url)

    page_url = files_url
    while page_url:
        try:
            response = requests.get(page_url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print("Error downloading transcription files:", exc)
            return

        if response.status_code != 200:
            print("Error downloading transcription files:", response.text)
            return

        payload = response.json()
        if isinstance(payload, dict):
            # The listing is a paged object: the entries are under "values" and
            # "@nextLink" (when present) points at the next page. Iterating the
            # object itself would yield its keys, not the file entries.
            files_info = payload.get("values", [])
            page_url = payload.get("@nextLink")
        else:
            # Still accept a bare JSON array of file entries.
            files_info = payload
            page_url = None

        for file_info in files_info:
            _save_transcription_file(file_info)

# =======================
# Main Program Flow
# =======================

def main():
    """Upload the local audio, run a batch transcription, and fetch the results."""
    # Step 1: Create a date-wise folder name and upload local audio files to
    # that folder in the container.
    date_folder = create_date_folder()
    print("Using date folder:", date_folder)
    upload_files_to_blob(date_folder)

    # Step 2: Start the transcription job. Only the ID is needed from here on.
    transcription_id, _ = start_transcription()
    if not transcription_id:
        return

    # Step 3: Poll until transcription is complete.
    final_data = check_transcription_status(transcription_id)
    if final_data:
        print("Transcription succeeded. Downloading transcription files...")
        download_transcription(final_data)
    else:
        print("Transcription did not succeed or encountered an error.")

# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

We are using the Python code above for batch transcription, and it shows the error below multiple times.
Hey Azure Community, please tell me how to resolve this issue.

Using date folder: 2025-02-12 Connecting to Azure Blob Storage... Uploaded 'file1.wav' as '2025-02-12/file1.wav' Uploaded 'file3.wav' as '2025-02-12/file3.wav' Uploaded 'file2.mp3' as '2025-02-12/file2.mp3' Uploaded 'file4.mp3' as '2025-02-12/file4.mp3' Uploaded 'file4.wav' as '2025-02-12/file4.wav' Uploaded 'file3.mp3' as '2025-02-12/file3.mp3' Uploaded 'file5.wav' as '2025-02-12/file5.wav' Uploaded 'file2.wav' as '2025-02-12/file2.wav' Uploaded 'file5.mp3' as '2025-02-12/file5.mp3' Uploaded 'file1.mp3' as '2025-02-12/file1.mp3' Starting bulk transcription... Transcription ID: 52acd530-a1bb-4e4d-8051-4a3d236eece0 Checking transcription status... Current status: Running Current status: Failed Transcription failed. Details: { "self": "https://centralindia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/52acd530-a1bb-4e4d-8051-4a3d236eece0", "displayName": "My Transcription", "locale": "en-IN", "createdDateTime": "2025-02-12T14:44:53Z", "lastActionDateTime": "2025-02-12T14:44:57Z", "status": "Failed", "model": { "self": "https://centralindia.api.cognitive.microsoft.com/speechtotext/v3.2/models/base/7b56a83a-3ccd-4dbb-9574-e96e3bef805c" }, "links": { "files": "https://centralindia.api.cognitive.microsoft.com/speechtotext/v3.2/transcriptions/52acd530-a1bb-4e4d-8051-4a3d236eece0/files" }, "properties": { "diarizationEnabled": true, "wordLevelTimestampsEnabled": true, "displayFormWordLevelTimestampsEnabled": false, "channels": [ 0, 1 ], "punctuationMode": "DictatedAndAutomatic", "profanityFilterMode": "Masked", "destinationContainerUrl": "https://czrndstorage.blob.core.windows.net/czrnd-container-transcription?sp=rcwl&st=2025-02-12T14:43:01Z&se=2025-02-12T22:43:01Z&spr=https&sv=2022-11-02&sr=c&sig=heNpXoXNuAcj5NxK1BHYos5AqN9rsiKZo4BSeuRiXdk%3D",

"error": { "code": "InvalidUri", "message": "The provided container URI is invalid." } } } Transcription did not succeed or encountered an error.

Azure AI Speech
Azure AI Speech
An Azure service that integrates speech processing into apps and services.
1,910 questions
Azure Blob Storage
Azure Blob Storage
An Azure service that stores unstructured data in the cloud as blobs.
3,082 questions
{count} votes

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.