30a84d87f185185a7256231097a461e2a255247d,dirty_cat/test/test_dirty_cat.py,,test_similarity_encoder,#,7
Before Change
# Excerpt from the body of test_similarity_encoder (pre-change version).
# NOTE(review): `model` and `lev` are defined outside this excerpt, and the
# original indentation was lost, so the nesting of the two loops is implied.
# Training strings, reshaped into a single column.
X = np.array(["aa", "aaa", "aaab"]).reshape(-1, 1)
# Strings to encode, reshaped into a single column.
X_test = np.array([["aa", "aaa", "aaa", "aaab", "aaac"]]).reshape(-1, 1)
model.fit(X)
encoder = model.transform(X_test)
# Reference matrix: one row per X_test entry, one column per X entry.
ans = np.zeros((len(X_test), len(X)))
for i, x_t in enumerate(X_test.reshape(-1)):
for j, x in enumerate(X.reshape(-1)):
# Cell (i, j) holds lev.ratio between test string i and training string j.
ans[i, j] = lev.ratio(x_t, x)
# Presumably `ans` is compared against `encoder` after this excerpt -- not shown here.
After Change
# Excerpt from the body of test_similarity_encoder (post-change version).
# NOTE(review): `similarity_encoder`, `lev` and `jaro_distance` come from outside
# this excerpt, and the original indentation was lost, so block nesting is implied.
# Training strings, reshaped into a single column.
X = np.array(["aa", "aaa", "aaab"]).reshape(-1, 1)
# Strings to encode, reshaped into a single column.
X_test = np.array([["aa", "aaa", "aaa", "aaab", "aaac"]]).reshape(-1, 1)
# Every similarity option exercised by this test.
similarity_types = [
"levenshtein-ratio",
"jaro-winkler"
]
for similarity_type in similarity_types:
# A fresh encoder is built for each similarity type.
model = similarity_encoder.SimilarityEncoder(
similarity_type=similarity_type, handle_unknown="ignore")
encoder = model.fit(X).transform(X_test)
if similarity_type == "levenshtein-ratio":
# Reference matrix: one row per X_test entry, one column per X entry.
ans = np.zeros((len(X_test), len(X)))
for i, x_t in enumerate(X_test.reshape(-1)):
for j, x in enumerate(X.reshape(-1)):
ans[i, j] = lev.ratio(x_t, x)
# np.array_equal demands exact element-wise equality, with no float tolerance.
assert np.array_equal(encoder, ans)
if similarity_type == "jaro-winkler":
# Same reference construction, using jaro_distance for each pair.
# NOTE(review): confirm jaro_distance matches what the encoder computes
# for "jaro-winkler" -- the names differ and neither definition is visible here.
ans = np.zeros((len(X_test), len(X)))
for i, x_t in enumerate(X_test.reshape(-1)):
for j, x in enumerate(X.reshape(-1)):
ans[i, j] = jaro_distance(x_t, x)
assert np.array_equal(encoder, ans)
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 5
Instances
Project Name: dirty-cat/dirty_cat
Commit Name: 30a84d87f185185a7256231097a461e2a255247d
Time: 2018-03-14
Author: patricio.cerda@inria.fr
File Name: dirty_cat/test/test_dirty_cat.py
Class Name:
Method Name: test_similarity_encoder
Project Name: dirty-cat/dirty_cat
Commit Name: ff8dccc29edca64877b3cef1b53c7958ce321f76
Time: 2018-03-14
Author: patricio.cerda@inria.fr
File Name: dirty_cat/test/test_similarity_encoder.py
Class Name:
Method Name: test_similarity_encoder
Project Name: undertheseanlp/underthesea
Commit Name: cf98094a2b174b5df0cab0ad0a02c0f82f6ab29e
Time: 2017-10-10
Author: brother.rain.1024@gmail.com
File Name: underthesea/word_sent/model.py
Class Name: CRFModel
Method Name: predict