Here is a snapshot of my results:

The complete reproducible solution can be downloaded from: https://github.com/CBrauer/Transform-and-AutoML
using System;
using System.Diagnostics;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.AutoML;
using Microsoft.ML.Data;
using MLLibrary;
namespace BottleRocketClassify {
internal static class Program {
public const string trainedModelPath = "../../../MLModel.zip";
#region PrintMetrics
#region BinaryExperimentProgressHandler
#region Head
public static class ModelBuilder {
    /// <summary>
    /// Trains binary-classification models with AutoML and saves the best one.
    /// Sweeps every (trainer, optimizing-metric) combination, cross-validates the
    /// best run of each experiment on the validation set, and persists the model
    /// with the highest mean cross-validated accuracy to <c>trainedModelPath</c>.
    /// </summary>
    public static void Run() {
        var mlContext = new MLContext(seed: 1);

        // Load the training and validation datasets (CSV with header).
        var trainDataView = mlContext.Data.LoadFromTextFile<ModelInput>(
            path: @"H:/HedgeTools/Datasets/rocket-train-classify.csv",
            hasHeader: true,
            separatorChar: ',');
        Head(trainDataView, 5);

        var validDataView = mlContext.Data.LoadFromTextFile<ModelInput>(
            path: @"H:/HedgeTools/Datasets/rocket-valid-classify.csv",
            hasHeader: true,
            separatorChar: ',');

        // NOTE(review): this normalization pipeline is fitted separately from the
        // AutoML experiment, so the saved AutoML model does NOT contain it. To get
        // a single self-contained model, pass this pipeline as the preFeaturizer
        // of the AutoML experiment instead of pre-transforming the data here.
        var pipeline = mlContext.Transforms.NormalizeMeanVariance("BoxRatio")
            .Append(mlContext.Transforms.NormalizeMeanVariance("Thrust"))
            .Append(mlContext.Transforms.NormalizeMeanVariance("Acceleration"))
            .Append(mlContext.Transforms.NormalizeMeanVariance("Velocity"))
            .Append(mlContext.Transforms.NormalizeMeanVariance("vwapGain"))
            .Append(mlContext.Transforms.NormalizeMeanVariance("OnBalRun"))
            .AppendCacheCheckpoint(mlContext);
        var model = pipeline.Fit(trainDataView);
        var transformedTrainDataView = model.Transform(trainDataView);
        Head(transformedTrainDataView, 5);

        // Metrics and trainers to sweep over.
        var optimizingMetrics = new[] {
            BinaryClassificationMetric.F1Score,
            BinaryClassificationMetric.AreaUnderRocCurve,
            BinaryClassificationMetric.AreaUnderPrecisionRecallCurve,
            BinaryClassificationMetric.PositiveRecall
        };
        var trainers = new[] { BinaryClassificationTrainer.FastTree };

        var bestMetric = 0.0;  // best mean CV accuracy seen so far
        foreach (var trainer in trainers) {
            foreach (var optimizingMetric in optimizingMetrics) {
                var sw = Stopwatch.StartNew();
                var settings = new BinaryExperimentSettings {
                    MaxExperimentTimeInSeconds = 1 * 60 * 60,
                    OptimizingMetric = optimizingMetric,
                    CacheDirectory = null
                };
                // Restrict the experiment to the single trainer under test.
                settings.Trainers.Clear();
                settings.Trainers.Add(trainer);

                Console.WriteLine("\n_____________________________________________________________________________\n" +
                    "Running AutoML binary classification experiment using: " +
                    trainer + ", " + optimizingMetric);

                var experimentResult = mlContext.Auto()
                    .CreateBinaryClassificationExperiment(settings)
                    .Execute(trainData: transformedTrainDataView,
                        validDataView,
                        labelColumnName: "Altitude",
                        progressHandler: new BinaryExperimentProgressHandler());

                var bestRun = experimentResult.BestRun;
                Console.WriteLine("Total models produced.... {0}", experimentResult.RunDetails.Count());

                // 10-fold cross-validation of the best pipeline on the validation set.
                var crossValidationResults = mlContext.BinaryClassification
                    .CrossValidateNonCalibrated(validDataView,
                        bestRun.Estimator,
                        numberOfFolds: 10,
                        labelColumnName: "Altitude");

                // Materialize the per-fold metrics once; chaining LINQ aggregates over
                // the lazy sequence would re-enumerate it for every average below.
                var metricsInMultipleFolds = crossValidationResults.Select(r => r.Metrics).ToArray();
                var accuracyValues = metricsInMultipleFolds.Select(m => m.Accuracy).ToArray();
                var accuracyAverage = accuracyValues.Average();
                var f1Average = metricsInMultipleFolds.Select(m => m.F1Score).Average();
                var aucAverage = metricsInMultipleFolds.Select(m => m.AreaUnderRocCurve).Average();
                var aucPrcAverage = metricsInMultipleFolds.Select(m => m.AreaUnderPrecisionRecallCurve).Average();

                // Sample standard deviation (n - 1 denominator) and 95% confidence
                // interval of the per-fold accuracies.
                var sumOfSquaresOfDifferences = accuracyValues.Sum(val => (val - accuracyAverage) * (val - accuracyAverage));
                var accuraciesStdDeviation = Math.Sqrt(sumOfSquaresOfDifferences / (accuracyValues.Length - 1));
                var accuraciesConfidenceInterval95 = 1.96 * accuraciesStdDeviation / Math.Sqrt(accuracyValues.Length - 1);

                Console.WriteLine("CrossValidation Metrics using the validation dataset:");
                Console.WriteLine(" trainer......................... {0}", bestRun.TrainerName);
                Console.WriteLine(" optimizingMetric................ {0}", optimizingMetric);
                Console.WriteLine(" AccuracyAverage................. {0}", accuracyAverage);
                Console.WriteLine(" F1Average....................... {0}", f1Average);
                Console.WriteLine(" AUCAverage...................... {0}", aucAverage);
                Console.WriteLine(" AUCPRCAverage................... {0}", aucPrcAverage);
                // Bug fix: the original label said "AUC" but the value printed here
                // is the mean cross-validated accuracy.
                Console.WriteLine(
                    " Cross Validation, Accuracy...... {0:f4}, Standard deviation: {1:f4}, Confidence Interval 95%: {2:f4}",
                    accuracyAverage, accuraciesStdDeviation, accuraciesConfidenceInterval95);

                // Keep the model with the best mean CV accuracy seen so far.
                if (accuracyAverage > bestMetric) {
                    bestMetric = accuracyAverage;
                    var bestTrainer = bestRun.TrainerName;
                    var bestOptimizingMetric = optimizingMetric.ToString();
                    Console.WriteLine("\n Best model's trainer............... {0}", bestTrainer);
                    Console.WriteLine("Best model's optimizingMetric........ {0}", bestOptimizingMetric);
                    Console.WriteLine("Best model's AccuracyAverage......... {0}", accuracyAverage);
                    Console.WriteLine("Best model's F1Average............... {0}", f1Average);
                    Console.WriteLine("Best model's AUCAverage.............. {0}", aucAverage);
                    Console.WriteLine("Best model's AUCPRCAverage........... {0}", aucPrcAverage);

                    var mlModel = bestRun.Model;
                    mlContext.Model.Save(mlModel, trainDataView.Schema, trainedModelPath);
                    Console.WriteLine("The model is saved.");
                    Console.WriteLine("\n_____________________________________________________________________________");

                    // Round-trip the saved model to verify it loads and scores correctly.
                    var savedModel = mlContext.Model.Load(trainedModelPath, out _);
                    var validDataViewWithBestScore2 = savedModel.Transform(validDataView);
                    var validMetrics2 = mlContext.BinaryClassification.EvaluateNonCalibrated(data: validDataViewWithBestScore2,
                        labelColumnName: "Altitude");
                    Console.WriteLine("\nConfusion Matrix from saved model using the validation dataset:\n{0}",
                        validMetrics2.ConfusionMatrix.GetFormattedConfusionTable());
                }

                sw.Stop();
                var ts = sw.Elapsed;
                // Bug fix: the original wrapped the format string and its arguments in
                // an extra pair of parentheses, which passed a ValueTuple to
                // Console.WriteLine and printed "(Experiment time: ..., h, m, s)"
                // instead of the formatted elapsed time.
                Console.WriteLine("Experiment time: {0:00}:{1:00}:{2:00}", ts.Hours, ts.Minutes, ts.Seconds);
            }
        }
        Console.WriteLine("Done.");
    }
}
/// <summary>
/// Entry point: runs the AutoML model-building sweep, then verifies the saved
/// model against the held-out test dataset before waiting for a keypress.
/// </summary>
private static void Main() {
    ModelBuilder.Run();

    const string testDatasetPath = @"H:/HedgeTools/Datasets/rocket-test-classify.csv";
    Verify.Model(trainedModelPath, testDatasetPath);

    Console.WriteLine("Done.");
    Console.ReadKey();
}
}
}
Any suggestions or help will be greatly appreciated.
Charles
Hi @CBrauer ,
I have tried to open your repro, however Visual Studio fails to load your project with the following error:
C:\Users\MLLibrary\MLLibrary.csproj : error : The project file could not be loaded. Could not find a part of the path 'C:\Users\MLLibrary\MLLibrary.csproj'. C:\Users\MLLibrary\MLLibrary.csproj
C:\Users\MLLibrary\MLLibrary.csproj : error : The project file 'C:\Users\MLLibrary\MLLibrary.csproj' has been moved, renamed or is not on your computer.
I also can't find MLLibrary.csproj or anything related to MLLibrary in your repro. Can you please add all necessary files and packages, and share your complete repro with us? Thanks.
Sorry 'bout that. Please download again.
You'll want to add the NormalizeMeanVariance sub-pipeline as your preFeaturizer in the AutoML .Execute() call.
See step 3 & 7 in the AutoML Advanced Experiment sample.
In your example above, you're applying the NormalizeMeanVariance to your training dataset and training the model on the normalized data. The normalization transform is in separate pipeline and not in your model. When you add the NormalizeMeanVariance pipeline as the preFeaturizer, it will be included in the created/saved model.
You could keep two models around, and always remember to use the first to normalize your data, then feed to the second. But a singular model is more simple. The preFeaturizer input to AutoML is designed to create a unified pipeline which prepends the given transforms, and correctly manages its internal data splits to ensure information isn't leaked during the fitting process.
I looked through the source code for ML.NET and I see a lot of work on hyperparameter optimization with "sweeper". Am I wasting my time doing a preFeaturizer with AutoML?
In most cases you can improve your model by exploring additional feature engineering techniques. The preFeaturizer allows you to explore additional featurization in a safe way. It should be easy enough to test if the NormalizeMeanVariance adds value.
A lot of it will come down to which feature engineering techniques you try. Here are a few of my favorites: https://github.com/dotnet/machinelearning-modelbuilder/issues/702#issuecomment-628283455
Thanks for your comments.
Thank you @justinormont for your help! @CBrauer I'm closing this issue as it seems your issue has been resolved. If you have any additional inquiries on this, please feel free to comment here. Thanks!
Most helpful comment
Thanks for your comments.