Hello @Sergio Ricardo de Freitas Oliveira,
In addition to kothapally Snigdha's comments about enabling key-value pairs with `features=[DocumentAnalysisFeature.KEY_VALUE_PAIRS]` in `prebuilt-layout` (under `azure.ai.documentintelligence`): this is necessary because `prebuilt-document` has been deprecated in the latest Document Intelligence API version (2023-10-31-preview).
To accurately extract key-value pairs, you can try the code below with `prebuilt-layout`, which has worked for me:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentAnalysisFeature, AnalyzeResult
from azure.core.exceptions import HttpResponseError
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# Read the service credentials from the environment. Assigning the variable
# *names* as string literals would make the check below impossible to fail
# and would hand the client a bogus endpoint/key.
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
if not endpoint or not key:
    raise ValueError("Missing Azure Document Intelligence API credentials in environment variables.")

# Single client shared by the analysis functions in this script.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

# NOTE: os.path.join discards earlier components when a later one is absolute,
# so on Windows this resolves to the C: drive path rather than a file next to
# this script. Replace the literal with your own document path as needed.
path_to_sample_document = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "C://ipdf.pdf")
)
# Fail early with a clear message instead of erroring later inside open().
if not os.path.exists(path_to_sample_document):
    raise FileNotFoundError(f"Document not found at {path_to_sample_document}")
def analyze_languages():
    """Print the languages the service detects in the sample document.

    Submits the file at ``path_to_sample_document`` to the ``prebuilt-layout``
    model with the ``LANGUAGES`` feature enabled, then prints each detected
    language's locale, confidence, and the text covered by its spans.

    Raises:
        HttpResponseError: Re-raised after being printed when the service
            call fails.
    """
    try:
        with open(path_to_sample_document, "rb") as document_stream:
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout",
                document_stream,  # the file stream is passed positionally
                features=[DocumentAnalysisFeature.LANGUAGES],
                content_type="application/octet-stream",
            )
            result: AnalyzeResult = poller.result()

        print("----Languages detected in the document----")
        if not result.languages:
            print("No languages detected.")
            return

        print(f"Detected {len(result.languages)} languages:")
        for index, language in enumerate(result.languages):
            print(f"- Language #{index}: locale '{language.locale}', Confidence: {language.confidence}")
            # Each span is an (offset, length) window into the full document text.
            snippets = []
            for span in language.spans:
                start = span.offset
                snippets.append(result.content[start: start + span.length])
            extracted_text = ",".join(snippets)
            print(f" Extracted Text: '{extracted_text}'")
    except HttpResponseError as error:
        print("Error during language detection:", error)
        raise
def analyze_key_value_pairs():
    """Print the key-value pairs the service extracts from the sample document.

    Submits the file at ``path_to_sample_document`` to the ``prebuilt-layout``
    model with the ``KEY_VALUE_PAIRS`` feature enabled, then prints the content
    and bounding regions of every key and value that is present.

    Raises:
        HttpResponseError: Re-raised after being printed when the service
            call fails.
    """
    try:
        with open(path_to_sample_document, "rb") as document_stream:
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout",
                document_stream,  # the file stream is passed positionally
                features=[DocumentAnalysisFeature.KEY_VALUE_PAIRS],
                content_type="application/octet-stream",
            )
            result: AnalyzeResult = poller.result()

        print("----Key-Value Pairs found in document----")
        if not result.key_value_pairs:
            print("No key-value pairs detected. Try enabling 'Key-Value Pairs' in Document Intelligence Studio.")
            return

        for pair in result.key_value_pairs:
            # Key and value are reported independently; either may be absent.
            key_field = pair.key
            if key_field:
                print(f"Key: '{key_field.content}' found in '{key_field.bounding_regions}'")
            value_field = pair.value
            if value_field:
                print(f"Value: '{value_field.content}' found in '{value_field.bounding_regions}'\n")
    except HttpResponseError as error:
        print("Error during key-value pair extraction:", error)
        raise
if __name__ == "__main__":
    # Run the analyses in order; the first failure stops the run and is
    # reported here rather than surfacing as a traceback.
    try:
        for run_analysis in (analyze_languages, analyze_key_value_pairs):
            run_analysis()
    except Exception as exc:
        print("An unexpected error occurred:", exc)
Output:
Hope this helps!
If you found this answer helpful, please click Accept Answer and kindly upvote it.
If you have any further questions, please click Comment.