ConversionsExtensionsCatalog.Hash 方法
定义
重要
一些信息与预发行产品相关,相应产品在发行之前可能会进行重大修改。 对于此处提供的信息,Microsoft 不作任何明示或暗示的担保。
重载
Hash(TransformsCatalog+ConversionTransforms, HashingEstimator+ColumnOptions[]) |
创建一个 HashingEstimator,它将输入列 InputColumnName 中的数据哈希到新列 Name 中。 |
Hash(TransformsCatalog+ConversionTransforms, String, String, Int32, Int32) |
创建一个 HashingEstimator,它将 inputColumnName 指定的列中的数据哈希到新列 outputColumnName 中。 |
Hash(TransformsCatalog+ConversionTransforms, HashingEstimator+ColumnOptions[])
创建一个 HashingEstimator,它将输入列 InputColumnName 中的数据哈希到新列 Name 中。
public static Microsoft.ML.Transforms.HashingEstimator Hash (this Microsoft.ML.TransformsCatalog.ConversionTransforms catalog, params Microsoft.ML.Transforms.HashingEstimator.ColumnOptions[] columns);
static member Hash : Microsoft.ML.TransformsCatalog.ConversionTransforms * Microsoft.ML.Transforms.HashingEstimator.ColumnOptions[] -> Microsoft.ML.Transforms.HashingEstimator
<Extension()>
Public Function Hash (catalog As TransformsCatalog.ConversionTransforms, ParamArray columns As HashingEstimator.ColumnOptions()) As HashingEstimator
参数
转换的目录。
- columns
- HashingEstimator.ColumnOptions[]
包含输入和输出列名称的估算器高级选项。 此估算器对文本、数字、布尔值、键和 DataViewRowId 数据类型进行操作。 新列的数据类型将是 UInt32 向量或 UInt32 标量,具体取决于输入列的数据类型是向量还是标量。
返回
示例
using System;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;
namespace Samples.Dynamic
{
    /// <summary>
    /// Sample that hashes a categorical string column and an integer column
    /// through the advanced-options (ColumnOptions) overload of the Hash
    /// transform.
    /// </summary>
    public static class HashWithOptions
    {
        /// <summary>
        /// Builds a five-row in-memory dataset, hashes both of its columns,
        /// prints the hashed rows, and then prints the original-value mapping
        /// read from the "KeyValues" annotation of the CategoryHashed column.
        /// </summary>
        public static void Example()
        {
            // The ML context is the entry point for ML.NET operations; it is
            // used for exception tracking and logging and is the source of
            // randomness (seeded here).
            var mlContext = new MLContext(seed: 1);

            // A small in-memory dataset.
            var samples = new[]
            {
                new DataPoint { Category = "MLB", Age = 18 },
                new DataPoint { Category = "NFL", Age = 14 },
                new DataPoint { Category = "NFL", Age = 15 },
                new DataPoint { Category = "MLB", Age = 18 },
                new DataPoint { Category = "MLS", Age = 14 },
            };
            var dataView = mlContext.Data.LoadFromEnumerable(samples);

            // Hashing cannot be reversed: the original value cannot be
            // recovered from the hash alone. For debugging or model
            // explainability it is still useful to know which original values
            // produced which hashes, so the Hash transform can record that
            // mapping in the Annotations of the newly created column.
            //
            // maximumNumberOfInverts: -1 keeps the full mapping; leaving it at
            // its default of 0 keeps no mapping.

            // Options for the string column: 16 hash bits, full mapping kept.
            var categoryOptions = new HashingEstimator.ColumnOptions(
                "CategoryHashed",
                "Category",
                16,
                useOrderedHashing: false,
                maximumNumberOfInverts: -1);

            // Options for the integer column: 8 hash bits, no mapping kept.
            var ageOptions = new HashingEstimator.ColumnOptions(
                "AgeHashed",
                "Age",
                8,
                useOrderedHashing: false);

            // One estimator that hashes both columns into new columns.
            var hashingPipeline = mlContext.Transforms.Conversion.Hash(
                categoryOptions, ageOptions);

            // Fit the pipeline, then apply it to the same data.
            var model = hashingPipeline.Fit(dataView);
            var hashedView = model.Transform(dataView);

            // Turn the resulting IDataView into an IEnumerable of
            // TransformedDataPoint for easy consumption.
            var rows = mlContext.Data.CreateEnumerable<TransformedDataPoint>(
                hashedView, reuseRowObject: true);

            Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed");
            foreach (var row in rows)
            {
                Console.WriteLine(
                    $"{row.Category}\t {row.CategoryHashed}\t\t {row.Age}\t {row.AgeHashed}");
            }

            // Expected data after the transformation.
            //
            // Category CategoryHashed Age AgeHashed
            // MLB 36206 18 127
            // NFL 19015 14 62
            // NFL 19015 15 43
            // MLB 36206 18 127
            // MLS 6013 14 62

            // Because maximumNumberOfInverts was set for the Category column,
            // the original category names and their correspondence with the
            // generated hash values are kept in the Annotations as a pair of
            // arrays: indices and values. Element i of the values array is the
            // original value that belongs to element i of the indices array.
            //
            // NOTE(review): in the expected output the printed indices are one
            // less than the CategoryHashed values above (e.g. 36205 vs 36206);
            // presumably key values are 1-based while slot indices are
            // 0-based - confirm against the ML.NET key type documentation.
            var keyValues = new VBuffer<ReadOnlyMemory<char>>();
            hashedView.Schema["CategoryHashed"].Annotations.GetValue(
                "KeyValues", ref keyValues);

            var slotIndices = keyValues.GetIndices();
            var originalValues = keyValues.GetValues();
            for (int slot = 0; slot < slotIndices.Length; slot++)
            {
                Console.WriteLine(
                    $"The original value of the {slotIndices[slot]} category is {originalValues[slot]}");
            }

            // Output Data
            //
            // The original value of the 6012 category is MLS
            // The original value of the 19014 category is NFL
            // The original value of the 36205 category is MLB
        }

        /// <summary>Input row: a category label and an age.</summary>
        public class DataPoint
        {
            public string Category { get; set; }
            public uint Age { get; set; }
        }

        /// <summary>
        /// Input row extended with the two hashed columns the pipeline adds.
        /// </summary>
        public class TransformedDataPoint : DataPoint
        {
            public uint CategoryHashed { get; set; }
            public uint AgeHashed { get; set; }
        }
    }
}
注解
此转换可以对多个列进行操作。
适用于
Hash(TransformsCatalog+ConversionTransforms, String, String, Int32, Int32)
创建一个 HashingEstimator,它将 inputColumnName 指定的
列中的数据哈希到新列: outputColumnName
public static Microsoft.ML.Transforms.HashingEstimator Hash (this Microsoft.ML.TransformsCatalog.ConversionTransforms catalog, string outputColumnName, string inputColumnName = default, int numberOfBits = 31, int maximumNumberOfInverts = 0);
static member Hash : Microsoft.ML.TransformsCatalog.ConversionTransforms * string * string * int * int -> Microsoft.ML.Transforms.HashingEstimator
<Extension()>
Public Function Hash (catalog As TransformsCatalog.ConversionTransforms, outputColumnName As String, Optional inputColumnName As String = Nothing, Optional numberOfBits As Integer = 31, Optional maximumNumberOfInverts As Integer = 0) As HashingEstimator
参数
Conversion 转换的目录。
- outputColumnName
- String
对 inputColumnName
进行转换后生成的列的名称。
此列的数据类型将是键的向量或键的标量,具体取决于输入列的数据类型是向量还是标量。
- inputColumnName
- String
要对其数据进行哈希处理的列的名称。
如果设置为 null
,则 outputColumnName
的值将用作源列。
此估算器对文本、数字、布尔值、键或 DataViewRowId 数据类型的向量或标量进行操作。
- numberOfBits
- Int32
要哈希到的位数。 必须介于 1 和 31 之间(含)。
- maximumNumberOfInverts
- Int32
在哈希处理期间,我们在原始值和生成的哈希值之间构造映射。
原始值的文本表示形式存储在新列批注的槽名称中。由于哈希的性质,多个初始值可能被映射到同一个哈希值。
maximumNumberOfInverts
指定对于每个哈希值,应保留的映射到该哈希值的不重复输入值数量的上限。
0 不保留任何输入值。 -1 保留映射到每个哈希的所有输入值。
返回
示例
using System;
using Microsoft.ML;
using Microsoft.ML.Data;
namespace Samples.Dynamic
{
    /// <summary>
    /// Sample that hashes a categorical string column and an integer column
    /// with the simple (column-name based) overload of the Hash transform.
    /// </summary>
    public static class Hash
    {
        /// <summary>
        /// Builds a five-row in-memory dataset, hashes both of its columns,
        /// prints the hashed rows, and then prints the original-value mapping
        /// read from the "KeyValues" annotation of the CategoryHashed column.
        /// </summary>
        public static void Example()
        {
            // The ML context is the entry point for ML.NET operations; it is
            // used for exception tracking and logging and is the source of
            // randomness (seeded here).
            var mlContext = new MLContext(seed: 1);

            // A small in-memory dataset.
            var samples = new[]
            {
                new DataPoint { Category = "MLB", Age = 18 },
                new DataPoint { Category = "NFL", Age = 14 },
                new DataPoint { Category = "NFL", Age = 15 },
                new DataPoint { Category = "MLB", Age = 18 },
                new DataPoint { Category = "MLS", Age = 14 },
            };
            var dataView = mlContext.Data.LoadFromEnumerable(samples);

            // Hashing cannot be reversed: the original value cannot be
            // recovered from the hash alone. For debugging or model
            // explainability it is still useful to know which original values
            // produced which hashes, so the Hash transform can record that
            // mapping in the Annotations of the newly created column.
            //
            // maximumNumberOfInverts: -1 keeps the full mapping; leaving it at
            // its default of 0 keeps no mapping.

            // First transform: hash the string column into 16 bits and keep
            // the full original-value mapping.
            var categoryHasher = mlContext.Transforms.Conversion.Hash(
                "CategoryHashed",
                "Category",
                numberOfBits: 16,
                maximumNumberOfInverts: -1);

            // Second transform: hash the integer column into 8 bits.
            var ageHasher = mlContext.Transforms.Conversion.Hash(
                "AgeHashed",
                "Age",
                numberOfBits: 8);

            // Chain the two transforms into a single pipeline.
            var hashingPipeline = categoryHasher.Append(ageHasher);

            // Fit the pipeline, then apply it to the same data.
            var model = hashingPipeline.Fit(dataView);
            var hashedView = model.Transform(dataView);

            // Turn the resulting IDataView into an IEnumerable of
            // TransformedDataPoint for easy consumption.
            var rows = mlContext.Data.CreateEnumerable<TransformedDataPoint>(
                hashedView, reuseRowObject: true);

            Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed");
            foreach (var row in rows)
            {
                Console.WriteLine(
                    $"{row.Category}\t {row.CategoryHashed}\t\t {row.Age}\t {row.AgeHashed}");
            }

            // Expected data after the transformation.
            //
            // Category CategoryHashed Age AgeHashed
            // MLB 36206 18 127
            // NFL 19015 14 62
            // NFL 19015 15 43
            // MLB 36206 18 127
            // MLS 6013 14 62

            // Because maximumNumberOfInverts was set for the Category column,
            // the original category names and their correspondence with the
            // generated hash values are kept in the Annotations as a pair of
            // arrays: indices and values. Element i of the values array is the
            // original value that belongs to element i of the indices array.
            //
            // NOTE(review): in the expected output the printed indices are one
            // less than the CategoryHashed values above (e.g. 36205 vs 36206);
            // presumably key values are 1-based while slot indices are
            // 0-based - confirm against the ML.NET key type documentation.
            var keyValues = new VBuffer<ReadOnlyMemory<char>>();
            hashedView.Schema["CategoryHashed"].Annotations.GetValue(
                "KeyValues", ref keyValues);

            var slotIndices = keyValues.GetIndices();
            var originalValues = keyValues.GetValues();
            for (int slot = 0; slot < slotIndices.Length; slot++)
            {
                Console.WriteLine(
                    $"The original value of the {slotIndices[slot]} category is {originalValues[slot]}");
            }

            // Output Data
            //
            // The original value of the 6012 category is MLS
            // The original value of the 19014 category is NFL
            // The original value of the 36205 category is MLB
        }

        /// <summary>Input row: a category label and an age.</summary>
        public class DataPoint
        {
            public string Category { get; set; }
            public uint Age { get; set; }
        }

        /// <summary>
        /// Input row extended with the two hashed columns the pipeline adds.
        /// </summary>
        public class TransformedDataPoint : DataPoint
        {
            public uint CategoryHashed { get; set; }
            public uint AgeHashed { get; set; }
        }
    }
}