OneHotEncoder
参数
def __init__(self, n_values=None, categorical_features=None,
categories=None, sparse=True, dtype=np.float64,
handle_unknown='error'):
handle_unknown
数据在转化为 one-hot 编码时,如果遇到一个在训练集中没有出现过的属性值,程序应该怎么办?如果是 error,程序就报错;如果是 ignore,程序会忽略这个未知取值并继续执行,该属性对应的 one-hot 各列全部置为 0。
from sklearn.preprocessing import OneHotEncoder

# Categories are learned per column from the training rows. With
# handle_unknown='ignore', a value that was not seen during fit is encoded
# as all zeros for that column instead of raising an error.
x_train = [['Male', 1], ['Female', 3], ['Female', 2]]
enc = OneHotEncoder(handle_unknown='ignore').fit(x_train)

# [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
print(enc.categories_)

# 'neutral' (first column) and 4 (second column) never appear in x_train.
x_test = [['neutral', 1], ['Male', 4]]
encoded = enc.transform(x_test)
# Convert the encoder output to a dense array so it prints as a full matrix.
x_test = encoded.toarray()
# [[0. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 0.]]
print(x_test)
sparse
如果为 True,transform 返回稀疏矩阵(sparse matrix);如果为 False,则返回普通的数组(array)。
from sklearn.preprocessing import OneHotEncoder

x_train = [['Male', 1], ['Female', 3], ['Female', 2]]
x_test = [['neutral', 1], ['Male', 4]]

# Default (sparse) output: transform() returns a sparse matrix, and printing
# it lists only the non-zero cells as "(row, column) value".
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(x_train)
# (0, 2) 1.0
# (1, 1) 1.0
print(enc.transform(x_test))

# Dense output: transform() returns an array directly.
# scikit-learn 1.2 renamed the `sparse` parameter to `sparse_output` and
# 1.4 removed the old name, so try the new keyword first and fall back to
# the old one on releases that do not know it yet.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
enc.fit(x_train)
x_test = enc.transform(x_test)
# [[0. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 0.]]
print(x_test)
查看每一列属性种类
- categories_
- get_feature_names
# `enc` is the encoder fitted in the previous example.
# [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
print(enc.categories_)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(); use whichever this release provides.
if hasattr(enc, 'get_feature_names_out'):
    feature_names = enc.get_feature_names_out(['gender', 'group'])
else:
    feature_names = enc.get_feature_names(['gender', 'group'])
# ['gender_Female' 'gender_Male' 'group_1' 'group_2' 'group_3']
print(feature_names)
inverse_transform
from sklearn.preprocessing import OneHotEncoder

x_train = [['Male', 1], ['Female', 3], ['Female', 2]]
enc = OneHotEncoder(handle_unknown='ignore').fit(x_train)

# One-hot rows to map back to the original values. The second row has no
# gender column set, which comes back as None (see the output below).
one_hot_rows = [
    [0, 1, 1, 0, 0],
    [0, 0, 0, 1, 0],
]
x_test = enc.inverse_transform(one_hot_rows)
# [['Male' 1]
#  [None 2]]
print(x_test)
LabelEncoder
- classes_
- inverse_transform
from sklearn.preprocessing import LabelEncoder

x_train = ["paris", "paris", "tokyo", "amsterdam"]
x_test = ["tokyo", "tokyo", "paris"]

# Fitting collects the distinct labels into classes_.
le = LabelEncoder().fit(x_train)
# ['amsterdam' 'paris' 'tokyo']
print(le.classes_)

# transform() replaces each label with its position in classes_.
x_test = le.transform(x_test)
# [2 2 1]
print(x_test)

# inverse_transform() maps the integer codes back to the original labels.
codes = [2, 2, 1]
# ['tokyo' 'tokyo' 'paris']
print(le.inverse_transform(codes))
网友评论