I need help uploading documents that include vector embeddings

Aravind Vijayaraghavan 20 Reputation points
2025-02-10T09:49:58.8533333+00:00

I have an issue where uploading documents without vector fields works well, but when I include vector fields (vector embeddings), the upload fails with this error:
**azure.search.documents._generated.operations._documents_operations.DocumentsOperations.index() got multiple values for keyword argument 'error_map'**
This is my current code:

def upload_documents_to_search_client(df, chunk_size=32000):
    """Upload the rows of ``df`` to the search index in batches, without vector fields.

    Each row becomes one document whose values are all strings; a column that
    is absent from the row is sent as an empty string. The ``aiid`` value is
    passed through ``map_aiid_to_label`` before being stringified, and the
    ``os_version`` document field is read from the ``version`` column.

    Args:
        df: Object whose ``iterrows()`` yields ``(index, row)`` pairs.
        chunk_size: Batch size handed to ``chunk_data``.

    Returns:
        None. The first failed batch is reported on stdout and stops the
        upload; later batches are not attempted.
    """
    # (document field, source column) pairs, in the order the document is built.
    field_sources = (
        ("hardware_id", "hardware_id"),
        ("text_feedback", "text_feedback"),
        ("uninstall_text_feedback", "uninstall_text_feedback"),
        ("os", "os"),
        ("date_ymd", "date_ymd"),
        ("Feature_Category", "Feature_Category"),
        ("Sentiment", "Sentiment"),
        ("country", "country"),
        ("aiid", "aiid"),
        ("version_app", "version_app"),
        ("os_version", "version"),
        ("architecture", "architecture"),
        ("score", "score"),
        ("region", "region"),
        ("city", "city"),
    )

    def to_document(row):
        # NOTE(review): the document carries "@search.action": "mergeOrUpload"
        # but is sent through upload_documents -- confirm the SDK honors it.
        doc = {"@search.action": "mergeOrUpload"}
        for field, column in field_sources:
            if column not in row:
                doc[field] = ""
                continue
            raw = row[column]
            if field == "aiid":
                raw = map_aiid_to_label(raw)
            doc[field] = str(raw)
        return doc

    data = [to_document(row) for _, row in df.iterrows()]

    for batch in chunk_data(data, chunk_size):
        try:
            search_client.upload_documents(documents=batch)
            print(f"Uploaded {len(batch)} documents successfully.")
        except Exception as exc:
            # Report the failure and abandon the remaining batches.
            print(f"An error occurred during document upload: {exc}")
            return None

def upload_documents_with_embeddings(df, embeddings_dict, chunk_size=32000):
    """Upload the rows of ``df`` that have embeddings, including their vector fields.

    Only rows whose stringified ``hardware_id`` appears in ``embeddings_dict``
    are uploaded. Scalar columns are sent as strings (empty string when the
    column is absent); the two vector fields default to an empty list when
    the matching entry does not provide them.

    Args:
        df: Object whose ``iterrows()`` yields ``(index, row)`` pairs; every
            row must provide ``hardware_id``.
        embeddings_dict: Iterable of mappings, each with a ``hardware_id`` key
            and an ``embeddings`` mapping that may contain
            ``vector_text_feedback`` and ``vector_uninstall_feedback``.
        chunk_size: Batch size handed to ``chunk_data``.

    Returns:
        None. The first failed batch is reported on stdout and stops the
        upload; later batches are not attempted.

    NOTE(review): documents with embeddings are much larger than plain ones;
    the reported ``error_map`` TypeError may come from the SDK's handling of
    an oversized request -- confirm, and try a much smaller ``chunk_size``
    (for example 1000) when calling this function.
    """
    # Index the entries by hardware_id once instead of rescanning the whole
    # list twice per row. setdefault keeps the FIRST entry for a duplicated
    # id, which is what a first-match linear scan would return.
    entries_by_id = {}
    for item in embeddings_dict:
        entries_by_id.setdefault(item["hardware_id"], item)

    data = []
    for _, row in df.iterrows():
        hardware_id = str(row["hardware_id"])

        entry = entries_by_id.get(hardware_id)
        if entry is None:
            # No embeddings for this row: it is not part of this upload.
            continue

        # Looked up lazily so that only matched entries need an "embeddings" key.
        embeddings = entry["embeddings"]
        document = {
            "@search.action": "mergeOrUpload",
            "hardware_id": hardware_id,
            "text_feedback": str(row["text_feedback"]) if "text_feedback" in row else "",
            "uninstall_text_feedback": str(row["uninstall_text_feedback"]) if "uninstall_text_feedback" in row else "",
            "os": str(row["os"]) if "os" in row else "",
            "date_ymd": str(row["date_ymd"]) if "date_ymd" in row else "",
            "Feature_Category": str(row["Feature_Category"]) if "Feature_Category" in row else "",
            "Sentiment": str(row["Sentiment"]) if "Sentiment" in row else "",
            "country": str(row["country"]) if "country" in row else "",
            "aiid": str(map_aiid_to_label(row["aiid"])) if "aiid" in row else "",
            "version_app": str(row["version_app"]) if "version_app" in row else "",
            # The index field is named os_version but the source column is "version".
            "os_version": str(row["version"]) if "version" in row else "",
            "architecture": str(row["architecture"]) if "architecture" in row else "",
            "score": str(row["score"]) if "score" in row else "",
            "region": str(row["region"]) if "region" in row else "",
            "city": str(row["city"]) if "city" in row else "",
            "vector_text_feedback": embeddings.get("vector_text_feedback", []),
            "vector_uninstall_feedback": embeddings.get("vector_uninstall_feedback", []),
        }
        data.append(document)

    for chunk in chunk_data(data, chunk_size):
        try:
            search_client.upload_documents(documents=chunk)
            print(f"Uploaded {len(chunk)} documents with embeddings successfully.")
        except Exception as e:
            # Report the failure and abandon the remaining batches.
            print(f"An error occurred during embeddings upload: {e}")
            return None

try:
    # Load the precomputed embeddings from the downloads folder.
    output_json_path = os.path.join(downloads_folder, "feedback_embeddings.json")
    with open(output_json_path) as json_file:
        embeddings_dict = json.loads(json_file.read())

    # Plain documents first, then the subset that has embeddings.
    upload_documents_to_search_client(df)
    upload_documents_with_embeddings(df, embeddings_dict)

    # NOTE(review): both upload functions catch their own exceptions and
    # return None, so this message is printed even when an upload failed.
    print("All documents and embeddings have been uploaded successfully.")

except Exception as exc:
    print(f"An error occurred: {exc}")

And I get this as my output:

Uploaded 32000 documents successfully.

Uploaded 32000 documents successfully.

Uploaded 32000 documents successfully.

Uploaded 28345 documents successfully.

An error occurred during embeddings upload: azure.search.documents._generated.operations._documents_operations.DocumentsOperations.index() got multiple values for keyword argument 'error_map'

All documents and embeddings have been uploaded successfully.

Basically, the first function works but the second one doesn't. Can anyone help with this?

This is also the format of the json file:

[
    {
        "hardware_id": "example",
        "embeddings": {
            "vector_text_feedback": [],
            "vector_uninstall_feedback": []
        }
    },
    {
        "hardware_id": "C286E4952A934E3782D02259E4620AD899F33263848199AB062BD00A2DD2F9AE",
        "embeddings": {
            "vector_text_feedback": [],
            "vector_uninstall_feedback": []
        }
    }
]
Azure AI Search
Azure AI Search
An Azure search service with built-in artificial intelligence capabilities that enrich information to help identify and explore relevant content at scale.
1,185 questions
0 comments No comments
{count} votes

Accepted answer
  1. Sina Salam 17,571 Reputation points
    2025-02-11T10:53:37.1633333+00:00

    Hello Aravind Vijayaraghavan,

    Welcome to the Microsoft Q&A and thank you for posting your questions here.

    I understand that you are in need of help to upload documents involving vector embeddings.

    You have done a great job; the error was because of conflicting parameters when vector fields are included.

    • Start by making sure you are on the latest version of azure-search-documents to avoid known bugs. From a shell, run: pip install --upgrade azure-search-documents
    • Make sure the vector fields are lists of floats and that no document keys conflict with SDK parameters.

    Now, when calling upload_documents, explicitly pass the documents list as a keyword argument to avoid parameter conflicts. The code below is a modified version of your code:

    def upload_documents_with_embeddings(df, embeddings_dict, chunk_size=32000):
        """Uploads only hardware_id documents with vector embeddings.

        Rows of ``df`` whose stringified ``hardware_id`` has no entry in
        ``embeddings_dict`` are skipped. Vector fields default to an empty
        list when the matching entry does not provide them.

        Args:
            df: Object whose ``iterrows()`` yields ``(index, row)`` pairs; every
                row must provide ``hardware_id``.
            embeddings_dict: Iterable of mappings, each with a ``hardware_id``
                key and, optionally, an ``embeddings`` mapping.
            chunk_size: Batch size handed to ``chunk_data``.

        Returns:
            None. The first failed batch is reported on stdout and stops the
            upload; later batches are not attempted.
        """
        # Index the entries by hardware_id once rather than scanning the whole
        # list for every row; setdefault keeps the first entry per id, which
        # is what a first-match scan would return.
        entries_by_id = {}
        for item in embeddings_dict:
            entries_by_id.setdefault(item["hardware_id"], item)

        data = []
        for _, row in df.iterrows():
            hardware_id = str(row["hardware_id"])

            # Retrieve embeddings correctly
            embedding_entry = entries_by_id.get(hardware_id)
            if not embedding_entry:
                continue  # Skip if no embedding found

            embeddings = embedding_entry.get("embeddings", {})

            document = {
                "@search.action": "mergeOrUpload",
                "hardware_id": hardware_id,
                # Include other fields as before
                "vector_text_feedback": embeddings.get("vector_text_feedback", []),
                "vector_uninstall_feedback": embeddings.get("vector_uninstall_feedback", [])
            }
            data.append(document)

        for chunk in chunk_data(data, chunk_size):
            try:
                # Explicitly pass documents as a keyword argument
                search_client.upload_documents(documents=chunk)
                print(f"Uploaded {len(chunk)} documents with embeddings successfully.")
            except Exception as e:
                # Report the failure and abandon the remaining batches.
                print(f"An error occurred during embeddings upload: {e}")
                return None
    

    I hope this is helpful! Do not hesitate to let me know if you have any other questions.


    Please don't forget to close the thread by upvoting this answer and accepting it if it was helpful.

    1 person found this answer helpful.
    0 comments No comments

0 additional answers

Sort by: Most helpful

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.