# Draw a horizontal box plot of the cross-validation scores.
# NOTE(review): all_scores is defined outside this snippet; the two tick
# labels below assume it holds exactly two score collections, one-hot first
# and similarity second -- confirm against the code that fills it.
import matplotlib.pyplot as plt
f, ax = plt.subplots()
# vert=False lays the boxes out horizontally, so each box sits on the y axis.
ax.boxplot(all_scores, vert=False)
# Name the boxes on the y axis.
ax.set_yticklabels(["one-hot\nencoding", "similarity\nencoding"])
###############################################################################
# We can see that encoding the data with a SimilarityEncoder instead of a
# OneHotEncoder substantially improves the cross-validation score!
After Change
from sklearn.model_selection import StratifiedKFold

# Shuffled 3-fold stratified split; the fixed random_state makes the shuffle
# reproducible from run to run.
cv = StratifiedKFold(n_splits=3, random_state=12, shuffle=True)

# Maps each encoding name to the scores returned by cross_val_score.
all_scores = {}
for method in ["one-hot", "similarity"]:
    pipeline = make_pipeline(method)
    # Now predict the census region of each participant
    scores = cross_val_score(pipeline, df, y, cv=cv)
    all_scores[method] = scores
    # Report the mean and spread of the scores for this encoding.
    print("%s encoding" % method)
    print("Accuracy score: mean: %.3f; std: %.3f\n"
          % (np.mean(scores), np.std(scores)))
###############################################################################
# Plot the results
# ------------------
import matplotlib.pyplot as plt
import seaborn

# Build a frame from all_scores and draw it as a horizontal box plot.
ax = seaborn.boxplot(data=pd.DataFrame(all_scores), orient="h")
# Label the axes on the Axes object returned by seaborn.
ax.set_ylabel("Encoding", size=17)
ax.set_xlabel("Prediction accuracy", size=17)
# Enlarge the y tick labels to match the axis labels.
plt.yticks(size=17)