Hello Aravind Vijayaraghavan,
Welcome to the Microsoft Q&A and thank you for posting your questions here.
I understand that you need to upload vector embeddings to an Azure AI Search index.
Regarding the two errors, you will need to ensure that all vector fields are lists and validate the JSON structure before uploading. Check the updated version of your code below, which includes the following changes:
- The validate_json_structure function ensures that the JSON structure is valid before attempting to upload.
- The applymap function is updated to ensure all vector fields are lists, even if they contain single values.
import pandas as pd
import json
def validate_json_structure(data):
    """Return True if *data* is JSON-serializable, False otherwise.

    Serialization problems are printed rather than raised, so callers can
    simply branch on the boolean result before attempting an upload.
    """
    try:
        json.dumps(data)
    except (TypeError, ValueError) as e:
        print(f"Invalid JSON structure: {e}")
        return False
    return True
def upload_documents_to_search_client(df, chunk_size=32000):
"""Uploads documents to the search client in chunks."""
data = [
{
"@search.action": "mergeOrUpload",
"id": str(row["id"]),
"vect_dev_exp_feedback": [] if pd.isna(row["vect_dev_exp_feedback"]) else list(row["vect_dev_exp_feedback"]),
"vect_neg_feedback": [] if pd.isna(row["vect_neg_feedback"]) else list(row["vect_neg_feedback"]),
"vect_tools_feedback": [] if pd.isna(row["vect_tools_feedback"]) else list(row["vect_tools_feedback"]),
"vect_wlb_feedback": [] if pd.isna(row["vect_wlb_feedback"]) else list(row["vect_wlb_feedback"]),
"vect_growth_feedback": [] if pd.isna(row["vect_growth_feedback"]) else list(row["vect_growth_feedback"]),
}
for _, row in df.iterrows()
]
if not validate_json_structure(data):
print("Aborting upload due to invalid JSON structure.")
return
for chunk in chunk_data(data, chunk_size):
try:
result = search_client.upload_documents(documents=chunk)
print(f"Uploaded {len(chunk)} documents successfully.")
except Exception as e:
print(f"An error occurred during document upload: {e}")
return None
def chunk_data(data, chunk_size):
    """Yield consecutive slices of *data*, each at most *chunk_size* long."""
    start = 0
    total = len(data)
    while start < total:
        yield data[start:start + chunk_size]
        start += chunk_size
# Example usage
df = pd.DataFrame({
    "id": [1, 2, 3],
    "vect_dev_exp_feedback": [[], [0.1, 0.2], None],
    "vect_neg_feedback": [None, [0.3, 0.4], [0.5, 0.6]],
    "vect_tools_feedback": [[], [], []],
    "vect_wlb_feedback": [None, None, None],
    "vect_growth_feedback": [[0.7, 0.8], [], [0.9, 1.0]]
})

# Normalize ONLY the vector columns. Mapping over the whole frame would also
# wrap the "id" values (1 -> [1]), producing document ids like "[1]". The
# isinstance check must run BEFORE pd.isna: pd.isna on a non-empty list
# returns an element-wise boolean array whose truth value is ambiguous.
vector_columns = [col for col in df.columns if col.startswith("vect_")]
for col in vector_columns:
    df[col] = df[col].map(
        lambda x: list(x) if isinstance(x, (list, tuple)) else ([] if pd.isna(x) else [x])
    )

upload_documents_to_search_client(df)
I hope this is helpful! Do not hesitate to let me know if you have any other questions.
Please don't forget to close the thread by upvoting and accepting this as the answer if it was helpful.