I need help with uploading documents involving vector embeddings?

Question

I have an issue: uploading documents without vector fields works well, but when I include vector fields (vector embeddings), the upload fails with this error:
**azure.search.documents._generated.operations._documents_operations.DocumentsOperations.index() got multiple values for keyword argument 'error_map'**

This is my current code:

def upload_documents_to_search_client(df, chunk_size=32000):
    """Uploads documents to the search client in chunks (without embeddings).

    Every row of ``df`` becomes one document whose values are all strings.
    A column that is absent from the row is sent as an empty string.

    Args:
        df: DataFrame of feedback rows; iterated with ``df.iterrows()``.
        chunk_size: Number of documents handed to ``chunk_data`` per batch.

    Returns:
        None. Uploading stops at the first batch that raises; the error is
        printed and the remaining batches are not attempted.
    """

    def text_or_blank(record, column):
        # Stringify the value when the column exists, otherwise fall back to "".
        if column in record:
            return str(record[column])
        return ""

    documents = []
    for _, record in df.iterrows():
        if "aiid" in record:
            # aiid is translated to its label before being stringified.
            aiid_label = str(map_aiid_to_label(record["aiid"]))
        else:
            aiid_label = ""

        documents.append({
            "@search.action": "mergeOrUpload",
            "hardware_id": text_or_blank(record, "hardware_id"),
            "text_feedback": text_or_blank(record, "text_feedback"),
            "uninstall_text_feedback": text_or_blank(record, "uninstall_text_feedback"),
            "os": text_or_blank(record, "os"),
            "date_ymd": text_or_blank(record, "date_ymd"),
            "Feature_Category": text_or_blank(record, "Feature_Category"),
            "Sentiment": text_or_blank(record, "Sentiment"),
            "country": text_or_blank(record, "country"),
            "aiid": aiid_label,
            "version_app": text_or_blank(record, "version_app"),
            # The index field is "os_version" but the source column is "version".
            "os_version": text_or_blank(record, "version"),
            "architecture": text_or_blank(record, "architecture"),
            "score": text_or_blank(record, "score"),
            "region": text_or_blank(record, "region"),
            "city": text_or_blank(record, "city"),
        })

    for batch in chunk_data(documents, chunk_size):
        try:
            search_client.upload_documents(documents=batch)
            print(f"Uploaded {len(batch)} documents successfully.")
        except Exception as e:
            # Report the failure and abandon the remaining batches.
            print(f"An error occurred during document upload: {e}")
            return None

def upload_documents_with_embeddings(df, embeddings_dict, chunk_size=32000):
    """Uploads only hardware_id documents with vector embeddings.

    Rows of ``df`` whose ``hardware_id`` has an entry in ``embeddings_dict``
    are turned into documents carrying the usual string fields plus the two
    vector fields. Rows without a matching entry are skipped.

    Args:
        df: DataFrame of feedback rows; must contain a ``hardware_id`` column.
        embeddings_dict: Iterable of entries shaped like
            ``{"hardware_id": ..., "embeddings": {"vector_text_feedback": [...],
            "vector_uninstall_feedback": [...]}}``. When a hardware_id occurs
            more than once, the first entry is used.
        chunk_size: Number of documents handed to ``chunk_data`` per batch.

    Returns:
        None. Uploading stops at the first batch that raises; the error is
        printed and the remaining batches are not attempted.
    """
    # Index the entries by hardware_id once, instead of re-scanning the whole
    # list twice for every row. setdefault keeps the first entry per id.
    entries_by_id = {}
    for item in embeddings_dict:
        entries_by_id.setdefault(item["hardware_id"], item)

    data = []
    for _, row in df.iterrows():
        hardware_id = str(row["hardware_id"])
        if hardware_id not in entries_by_id:
            continue

        embeddings = entries_by_id[hardware_id]["embeddings"]
        data.append({
            "@search.action": "mergeOrUpload",
            "hardware_id": hardware_id,
            "text_feedback": str(row["text_feedback"]) if "text_feedback" in row else "",
            "uninstall_text_feedback": str(row["uninstall_text_feedback"]) if "uninstall_text_feedback" in row else "",
            "os": str(row["os"]) if "os" in row else "",
            "date_ymd": str(row["date_ymd"]) if "date_ymd" in row else "",
            "Feature_Category": str(row["Feature_Category"]) if "Feature_Category" in row else "",
            "Sentiment": str(row["Sentiment"]) if "Sentiment" in row else "",
            "country": str(row["country"]) if "country" in row else "",
            "aiid": str(map_aiid_to_label(row["aiid"])) if "aiid" in row else "",
            "version_app": str(row["version_app"]) if "version_app" in row else "",
            # The index field is "os_version" but the source column is "version".
            "os_version": str(row["version"]) if "version" in row else "",
            "architecture": str(row["architecture"]) if "architecture" in row else "",
            "score": str(row["score"]) if "score" in row else "",
            "region": str(row["region"]) if "region" in row else "",
            "city": str(row["city"]) if "city" in row else "",
            "vector_text_feedback": embeddings.get("vector_text_feedback", []),
            "vector_uninstall_feedback": embeddings.get("vector_uninstall_feedback", []),
        })

    # NOTE(review): batches that include vectors are far larger than the
    # text-only ones. The "got multiple values for keyword argument
    # 'error_map'" failure is presumably raised on the SDK's oversized-request
    # (HTTP 413) retry path in older azure-search-documents releases --
    # confirm, and if so call this with a much smaller chunk_size and/or
    # upgrade the package.
    for chunk in chunk_data(data, chunk_size):
        try:
            search_client.upload_documents(documents=chunk)
            print(f"Uploaded {len(chunk)} documents with embeddings successfully.")
        except Exception as e:
            # Report the failure and abandon the remaining batches.
            print(f"An error occurred during embeddings upload: {e}")
            return None

# Driver: load the precomputed embeddings from disk, then run both uploads.
try:
    # The embeddings file is expected in downloads_folder.
    output_json_path = os.path.join(downloads_folder, "feedback_embeddings.json")
    with open(output_json_path, 'r') as json_file:
        embeddings_dict = json.load(json_file)
    
    # First pass sends the text-only documents, second pass the ones with vectors.
    upload_documents_to_search_client(df)
    upload_documents_with_embeddings(df, embeddings_dict)
    
    # NOTE: both upload functions catch their own exceptions and return None,
    # so this message is printed even when one of the uploads failed.
    print("All documents and embeddings have been uploaded successfully.")

except Exception as e:
    # Only reached for errors raised outside the upload functions' own
    # handlers (e.g. while reading or parsing the JSON file).
    print(f"An error occurred: {e}")

And I get this as my output:

Uploaded 32000 documents successfully.

Uploaded 28345 documents successfully.

An error occurred during embeddings upload: azure.search.documents._generated.operations._documents_operations.DocumentsOperations.index() got multiple values for keyword argument 'error_map'

All documents and embeddings have been uploaded successfully.

Basically, the first function works but the second one doesn't. Can anyone help with this?

This is also the format of the json file:

[
    {
        "hardware_id": "example",
        "embeddings": {
            "vector_text_feedback": [],
            "vector_uninstall_feedback": []
        }
    }
]

    {
        "hardware_id": "C286E4952A934E3782D02259E4620AD899F33263848199AB062BD00A2DD2F9AE",
        "embeddings": {
            "vector_text_feedback": [],
            "vector_uninstall_feedback": []
        }
    }
]

Accepted Answer

Hello Aravind Vijayaraghavan,

Welcome to the Microsoft Q&A and thank you for posting your questions here.

I understand that you are in need of help to upload documents involving vector embeddings.

You have done a great job; the error was because of conflicting parameters when vector fields are included.

Start by making sure you have the latest version of azure-search-documents installed, to avoid known bugs. From a shell, run: pip install --upgrade azure-search-documents
Make sure the vector fields are lists of floats and that no document keys conflict with SDK parameters.

Now, when calling upload_documents, explicitly pass the documents list to avoid parameter conflicts. The code below is a modified version of your code:

def upload_documents_with_embeddings(df, embeddings_dict, chunk_size=32000):
    """Uploads only hardware_id documents with vector embeddings.

    Rows of ``df`` whose ``hardware_id`` has an entry in ``embeddings_dict``
    become documents carrying the two vector fields. Rows without a matching
    entry are skipped.

    Args:
        df: DataFrame of feedback rows; must contain a ``hardware_id`` column.
        embeddings_dict: Iterable of entries shaped like
            ``{"hardware_id": ..., "embeddings": {...}}``. When a hardware_id
            occurs more than once, the first entry is used.
        chunk_size: Number of documents handed to ``chunk_data`` per batch.

    Returns:
        None. Uploading stops at the first batch that raises; the error is
        printed and the remaining batches are not attempted.
    """
    # Index the entries by hardware_id once, instead of scanning the whole
    # list again for every row. setdefault keeps the first entry per id.
    entries_by_id = {}
    for item in embeddings_dict:
        entries_by_id.setdefault(item["hardware_id"], item)

    data = []
    for _, row in df.iterrows():
        hardware_id = str(row["hardware_id"])

        embedding_entry = entries_by_id.get(hardware_id)
        if not embedding_entry:
            continue  # Skip if no embedding found

        # A missing "embeddings" key is treated as "no vectors".
        embeddings = embedding_entry.get("embeddings", {})

        document = {
            "@search.action": "mergeOrUpload",
            "hardware_id": hardware_id,
            # Include other fields as before
            "vector_text_feedback": embeddings.get("vector_text_feedback", []),
            "vector_uninstall_feedback": embeddings.get("vector_uninstall_feedback", [])
        }
        data.append(document)

    for chunk in chunk_data(data, chunk_size):
        try:
            # Explicitly pass documents as a keyword argument
            search_client.upload_documents(documents=chunk)
            print(f"Uploaded {len(chunk)} documents with embeddings successfully.")
        except Exception as e:
            # Report the failure and abandon the remaining batches.
            print(f"An error occurred during embeddings upload: {e}")
            return None

I hope this is helpful! Do not hesitate to let me know if you have any other questions.

Please don't forget to close the thread by upvoting this answer and accepting it if it was helpful.

Share via

I need help with uploading documents involving vector embeddings?

0 additional answers

Your answer