Document Intelligence API returning incomplete KeyValuePairs (incorrectly detected)

Question

Hello all.

I have a scanned PDF whose text is correctly extracted by AI Document Intelligence. However, a string that appears on both page 1 and page 2 (and is correctly extracted to text from both pages) appears only once in the KeyValuePair data structure: it is reported for page 2 only, even though it is the very same string on both pages.

The document is a generic-type PDF, not from a specific form.

Any hints?

Thank you.

-SR

Answer

Hello @Sergio Ricardo de Freitas Oliveira,

Adding to kothapally Snigdha's comments: you need to enable key-value pairs by passing features=[DocumentAnalysisFeature.KEY_VALUE_PAIRS] to the prebuilt-layout model in azure.ai.documentintelligence. This is necessary because prebuilt-document has been deprecated in the latest Document Intelligence API version (2023-10-31-preview).

To accurately extract key-value pairs, you can try the code below with prebuilt-layout, which has worked for me:


import os

from azure.core.credentials import AzureKeyCredential

from azure.ai.documentintelligence import DocumentIntelligenceClient

from azure.ai.documentintelligence.models import DocumentAnalysisFeature, AnalyzeResult

from azure.core.exceptions import HttpResponseError

from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file (if one is found) into the
# process environment so the credentials below can be read from it.
load_dotenv(find_dotenv())

# Read the credentials from the environment. Assigning the variable *names*
# as literal strings would make the check below always pass and would hand
# the client an invalid endpoint and key.
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

if not endpoint or not key:
    raise ValueError("Missing Azure Document Intelligence API credentials in environment variables.")

# Shared client used by the analysis functions in this script.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

# NOTE: on Windows the drive-qualified segment ("C://ipdf.pdf") makes
# os.path.join discard the script directory, so this resolves to the file on
# drive C. On POSIX systems the segment is treated as relative to the
# script directory instead. Replace it with the path to your own document.
path_to_sample_document = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "C://ipdf.pdf")
)

# Fail early with a clear message rather than at the first open() call.
if not os.path.exists(path_to_sample_document):
    raise FileNotFoundError(f"Document not found at {path_to_sample_document}")

def analyze_languages():
    """Analyze the sample document with the LANGUAGES add-on and print the result.

    Sends the file at ``path_to_sample_document`` to the ``prebuilt-layout``
    model, then prints each detected language with its locale, confidence,
    and the text covered by its spans.

    Raises:
        HttpResponseError: Re-raised after the error has been printed.
    """
    try:
        with open(path_to_sample_document, "rb") as document_stream:
            analysis_poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout",
                document_stream,  # the file stream is passed positionally
                features=[DocumentAnalysisFeature.LANGUAGES],
                content_type="application/octet-stream",
            )

        analysis: AnalyzeResult = analysis_poller.result()

        print("----Languages detected in the document----")

        if not analysis.languages:
            print("No languages detected.")
            return

        print(f"Detected {len(analysis.languages)} languages:")

        for position, language in enumerate(analysis.languages):
            print(f"- Language #{position}: locale '{language.locale}', Confidence: {language.confidence}")

            # Each span is an (offset, length) window into the full document text.
            extracted_text = ",".join(
                analysis.content[piece.offset: piece.offset + piece.length]
                for piece in language.spans
            )
            print(f"  Extracted Text: '{extracted_text}'")

    except HttpResponseError as error:
        print("Error during language detection:", error)
        raise

def analyze_key_value_pairs():
    """Extract key-value pairs from the sample document and print them.

    Sends the file at ``path_to_sample_document`` to the ``prebuilt-layout``
    model with the KEY_VALUE_PAIRS add-on enabled, then prints the content
    and bounding regions of every key and value that was returned.

    Raises:
        HttpResponseError: Re-raised after the error has been printed.
    """
    try:
        with open(path_to_sample_document, "rb") as f:
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout",
                f,  # Pass the file stream as a positional argument
                features=[DocumentAnalysisFeature.KEY_VALUE_PAIRS],
                content_type="application/octet-stream",
            )

        result: AnalyzeResult = poller.result()

        print("----Key-Value Pairs found in document----")

        if result.key_value_pairs:
            for kv_pair in result.key_value_pairs:
                if kv_pair.key:
                    print(f"Key: '{kv_pair.key.content}' found in '{kv_pair.key.bounding_regions}'")

                if kv_pair.value:
                    # The trailing "\n" puts a blank line between pairs. It must be
                    # an escape sequence: a raw line break inside the literal
                    # is a syntax error.
                    print(f"Value: '{kv_pair.value.content}' found in '{kv_pair.value.bounding_regions}'\n")
        else:
            print("No key-value pairs detected. Try enabling 'Key-Value Pairs' in Document Intelligence Studio.")

    except HttpResponseError as error:
        print("Error during key-value pair extraction:", error)
        raise

if __name__ == "__main__":
    # Run the language analysis first, then the key-value extraction. Any
    # failure is reported here and stops the remaining steps.
    analysis_steps = (analyze_languages, analyze_key_value_pairs)
    try:
        for run_step in analysis_steps:
            run_step()
    except Exception as failure:
        print("An unexpected error occurred:", failure)

Output:

Output

Hope this helps!

If you found this answer helpful, please click Accept Answer and kindly upvote it.

Accept Answer

If you have any further questions, please click Comment.

Share via

Document Intelligence API returning incomplete KeyValuePairs (incorrectly detected)

1 answer

Your answer