from sklearn.model_selection import StratifiedKFold

# Stratified 3-fold splitter; shuffling with a fixed random_state keeps the
# fold assignment identical from run to run.
cv = StratifiedKFold(n_splits=3, random_state=12, shuffle=True)
all_scores = []
# Cross-validate one pipeline per encoding method.
# NOTE(review): `scores` is not stored in `all_scores` within the lines
# visible here -- presumably that happens in code outside this excerpt.
for method in ["one-hot", "similarity"]:
    pipeline = make_pipeline(method)
    # Now predict the census region of each participant
    scores = cross_val_score(pipeline, df, y, cv=cv)
# After Change
import matplotlib.pyplot as plt

# One font size shared by both axis labels and the y tick labels.
label_font_size = 17
plt.xlabel("Prediction accuracy", size=label_font_size)
plt.ylabel("Encoding", size=label_font_size)
plt.yticks(size=label_font_size)
# Adjust the subplot padding once all labels are in place.
plt.tight_layout()
###############################################################################
# We can see that encoding the data using a SimilarityEncoder instead of