Hello everyone,
I am trying to set up an indexer that indexes documents from a data source (Azure Blob Storage) into an Azure AI Search index. I have also created a skillset with a chunking skill and an embedding skill, specifically "Microsoft.Skills.Text.SplitSkill" and "Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill".
What ends up happening is that the documents get indexed, but neither the chunking nor the embedding takes place: the documents land in the index as is, without being chunked, and there are no embeddings. I have also set "outputFieldMappings" to map the output of the embedding skill to the "embedding" field in the index.
Still, no luck. I will provide the Indexer and Skillset JSON below, along with a rough sketch of the index's vector field.
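For reference, the "embedding" field in the target index is a vector field. Since I am redacting the full index definition, the sketch below only shows its rough shape; the vector search profile name is a placeholder rather than my exact configuration:

{
  "name": "embedding",
  "type": "Collection(Edm.Single)",
  "searchable": true,
  "retrievable": true,
  "dimensions": 1536,
  "vectorSearchProfile": "my-vector-profile"
}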
Indexer
{
  "@odata.context": "<redacted>",
  "@odata.etag": "<redacted>",
  "name": "<redacted>",
  "description": null,
  "dataSourceName": "azureblob-1737725881239-datasource",
  "skillsetName": "document-chunk-and-embedding-skillset",
  "targetIndexName": "<redacted>",
  "disabled": null,
  "schedule": null,
  "parameters": {
    "batchSize": null,
    "maxFailedItems": null,
    "maxFailedItemsPerBatch": null,
    "base64EncodeKeys": null,
    "configuration": {
      "dataToExtract": "contentAndMetadata",
      "parsingMode": "default"
    }
  },
  "fieldMappings": [],
  "outputFieldMappings": [
    {
      "sourceFieldName": "/document/myEmbedding",
      "targetFieldName": "embedding",
      "mappingFunction": null
    }
  ],
  "cache": null,
  "encryptionKey": null
}
Skillset
{
  "@odata.etag": "<redacted>",
  "name": "document-chunk-and-embedding-skillset",
  "description": "",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "Document Chunk Splitter Skill",
      "description": "This skill is used to split documents into chunks",
      "context": "/document",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 512,
      "pageOverlapLength": 102,
      "maximumPagesToTake": 0,
      "unit": "azureOpenAITokens",
      "inputs": [
        {
          "name": "text",
          "source": "/document/content",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "chunks"
        }
      ],
      "azureOpenAITokenizerParameters": {
        "encoderModelName": "cl100k_base",
        "allowedSpecialTokens": []
      }
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
      "name": "Chunk Embedding Skill",
      "description": "This skill creates embeddings for each chunk created from the documents.",
      "context": "/document/chunks/*",
      "resourceUri": "<redacted>",
      "apiKey": "<redacted>",
      "deploymentId": "<redacted>",
      "dimensions": 1536,
      "modelName": "text-embedding-ada-002",
      "inputs": [
        {
          "name": "text",
          "source": "/document/chunks",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "embedding",
          "targetName": "myEmbedding"
        }
      ]
    }
  ],
  "cognitiveServices": {
    "@odata.type": "#Microsoft.Azure.Search.AIServicesByKey",
    "subdomainUrl": "<redacted>"
  }
}
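To illustrate the symptom, this is roughly what a document in the index looks like right now. Field names other than "embedding" are illustrative, but the point is that the content field holds the entire, unchunked document text and the embedding stays empty:

{
  "id": "<document key>",
  "content": "<entire document text, not split into chunks>",
  "embedding": null
}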
I would appreciate any help with this matter.
Thanks in advance!