特征工程-特征构建
特征构建
-
填充缺失值(分别处理定性特征和定量特征)
-
分箱
-
用虚拟变量(哑变量)对分类列进行编码
-
对定序分类字符串进行编码
# Example of a single raw record: one value for each kind of feature.
xs = [
    dict(
        boolean=True,
        city='tokyo',
        ordinal_column='somewhat like',
        quantitative_column=1.0,
    ),
]
import pandas as pd
# Toy dataset used by every transformer below. Three of the four columns
# contain one missing value each.
df = pd.DataFrame(dict(
    boolean=['yes', 'no', None, 'no', 'no', 'yes'],
    city=['tokyo', None, 'london', 'settle', 'san francisco', 'tokyo'],
    ordinal_column=['somewhat like', 'like', 'somewhat like', 'like', 'somewhat like', 'dislike'],
    quantitative=[1, 11, -0.5, 10, None, 20],
))
# Show the data, then the number of missing values per column.
print(df.head(6), '*' * 40, sep='\n')
print(df.isna().sum(), '*' * 40, sep='\n')
# Most frequent city (first entry of the descending value counts).
print(df['city'].value_counts().index[0])
# boolean: binary categorical data, nominal level
# city: categorical data, nominal level
from sklearn.base import TransformerMixin
class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in categorical columns with each column's mode.

    Parameters
    ----------
    cols : list of str
        Names of the categorical columns to impute.
    """

    def __init__(self, cols):
        self.cols = cols

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``self.cols`` filled.

        The fill value is the most frequent value of the column in the frame
        being transformed. A column without any non-missing value is left
        unchanged because it has no mode to fill with.
        """
        X = df.copy()
        for col in self.cols:
            counts = X[col].value_counts()
            if counts.empty:
                # All values are missing: nothing sensible to impute.
                continue
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the X[col] selection. That form is
            # chained assignment: recent pandas versions warn about it and
            # under Copy-on-Write it leaves X unchanged.
            X[col] = X[col].fillna(counts.index[0])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; required by the fit/transform API."""
        return self
# Impute the two nominal columns and display the result.
df_1 = CustomCategoryImputer(cols=['city', 'boolean']).fit_transform(df)
print('*' * 40, df_1, sep='\n')
from sklearn.impute import SimpleImputer
# quantitative column: quantitative data, ratio level
class CustomQuantitativeImputer(TransformerMixin):
    """Fill missing values in quantitative columns using ``SimpleImputer``.

    Parameters
    ----------
    cols : list of str, optional
        Columns to impute. When ``None`` (the default), every numeric column
        of the frame passed to ``transform`` is imputed.
    strategy : str, default 'mean'
        Strategy forwarded to ``SimpleImputer``.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def transform(self, df):
        """Return a copy of ``df`` with the selected columns imputed.

        Each column is imputed independently, using statistics computed from
        the frame being transformed.
        """
        X = df.copy()
        cols = self.cols
        if cols is None:
            # The default used to crash with "NoneType is not iterable";
            # fall back to all numeric columns instead.
            cols = list(X.select_dtypes(include='number').columns)
        impute = SimpleImputer(strategy=self.strategy)
        for col in cols:
            # Double brackets keep the column 2-D, as SimpleImputer expects.
            X[col] = impute.fit_transform(X[[col]])
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; required by the fit/transform API."""
        return self
# Available strategies: ['mean', 'median', 'most_frequent', 'constant']
df_2 = CustomQuantitativeImputer(
    cols=['quantitative'],
    strategy='median',
).fit_transform(df_1)
print('*' * 40, df_2, '*' * 40, sep='\n')
class CustomDummifer(TransformerMixin):
    """Replace categorical columns with dummy columns via ``pd.get_dummies``.

    Parameters
    ----------
    cols : list of str
        Names of the columns to convert into dummy/indicator columns.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, *_):
        """Do nothing and return ``self``; required by the fit/transform API."""
        return self

    def transform(self, df):
        """Return ``df`` with ``self.cols`` expanded into dummy columns."""
        dummy_columns = self.cols
        encoded = pd.get_dummies(df, columns=dummy_columns)
        return encoded
# Direct pandas call, kept for reference:
# c = pd.get_dummies(df_2, columns=['city', 'boolean'], prefix_sep='_')
df_3 = CustomDummifer(cols=['city', 'boolean']).fit_transform(df_2)
print(df_3, '*' * 40, sep='\n')
# ordinal_column: ordered categorical data, ordinal level
class CustomEncode(TransformerMixin):
    """Encode an ordinal column as the position of each value in ``ordering``.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    ordering : list, optional
        Category values from lowest to highest. The column is only encoded
        when this is a ``list``; otherwise ``transform`` returns a plain copy.
    """

    def __init__(self, col, ordering=None):
        self.col = col
        self.ordering = ordering

    def fit(self, *_):
        """Do nothing and return ``self``; required by the fit/transform API."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``self.col`` mapped to list positions.

        A value that does not appear in ``ordering`` raises ``ValueError``
        (from ``list.index``).
        """
        X = df.copy()
        if not isinstance(self.ordering, list):
            return X

        def position(value):
            # Rank of the category = its index in the configured ordering.
            return self.ordering.index(value)

        X[self.col] = X[self.col].map(position)
        return X
# Encode the ordinal column as ranks: dislike < somewhat like < like.
df_4 = CustomEncode(
    col='ordinal_column',
    ordering=['dislike', 'somewhat like', 'like'],
).fit_transform(df_2)
print(df_4, '*' * 40, sep='\n')
# Next: bin the continuous feature.
class CustomCut(TransformerMixin):
    """Bin a continuous column with ``pd.cut``.

    Parameters
    ----------
    col : str
        Name of the column to bin.
    bins : int, sequence of scalars or IntervalIndex, optional
        Forwarded to ``pd.cut``. When it is ``None``, ``0`` or an empty
        sequence, ``transform`` returns a plain copy.
    labels : optional
        Forwarded to ``pd.cut`` as ``labels``.
    """

    def __init__(self, col, bins=None, labels=None):
        self.col = col
        self.bins = bins
        self.labels = labels

    @staticmethod
    def _has_bins(bins):
        """Tell whether ``bins`` holds a usable binning specification."""
        if bins is None:
            return False
        try:
            # Sequences, NumPy arrays and IntervalIndex: test emptiness with
            # len(), because their truth value is ambiguous.
            return len(bins) > 0
        except TypeError:
            # Scalars (e.g. an int number of bins) have no len().
            return bool(bins)

    def transform(self, df):
        """Return a copy of ``df`` with ``self.col`` replaced by its bins."""
        X = df.copy()
        if self._has_bins(self.bins):
            X[self.col] = pd.cut(X[self.col], bins=self.bins, labels=self.labels)
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; required by the fit/transform API."""
        return self
# Alternative with explicit bin edges:
# df_5 = CustomCut('quantitative', bins=[1, 3, 6, 9]).fit_transform(df_2)
# Split the quantitative column into two bins.
df_5 = CustomCut(col='quantitative', bins=2).fit_transform(df_2)
print(df_5)
if __name__ == '__main__':
    # Build the pipeline: chain every custom transformer so the raw frame is
    # imputed, binned, dummified and ordinal-encoded in a single call.
    from sklearn.pipeline import Pipeline

    cci = CustomCategoryImputer(cols=['city', 'boolean'])
    cq = CustomQuantitativeImputer(cols=['quantitative'], strategy='median')
    cc = CustomCut(col='quantitative', bins=4, labels=False)
    cd = CustomDummifer(cols=['city', 'boolean'])
    ce = CustomEncode(col='ordinal_column', ordering=['dislike', 'somewhat like', 'like'])

    # Steps run in the listed order.
    steps = list(zip(
        ('cci', 'cq', 'cc', 'cd', 'ce'),
        (cci, cq, cc, cd, ce),
    ))
    pipe = Pipeline(steps)

    print('流水线。。。。。')
    df_ = pipe.fit_transform(df)
    print(df_)
    # Persist the engineered features next to the script.
    df_.to_csv('index.csv')
扩展数值特征
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
# Expand the numeric features with polynomial terms to get a better view of
# how the original features interact.
def _feature_names(transformer):
    """Return the output feature names of a fitted transformer.

    ``get_feature_names()`` was removed from scikit-learn, so prefer
    ``get_feature_names_out()`` and fall back to the old method only on
    releases that do not have the new one.
    """
    getter = getattr(transformer, 'get_feature_names_out', None)
    if getter is None:
        getter = transformer.get_feature_names
    return getter()


# Degree-2 expansion without the constant column: originals, squares and
# pairwise products.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
df = pd.DataFrame({
    'x1': [0, 1, 2, 3, 4, 5, 6, 7, 8],
    'x2': [11, 12, 13, 14, 15, 16, 17, 18, 19],
    'x3': [0, 0, 1, 1, 1, 1, 1, 0, 0],
})
print(df)
print('============================')
x_p = poly.fit_transform(df)
df_1 = pd.DataFrame(x_p, columns=_feature_names(poly))
print(df_1)
# Interaction terms only (no squares), plus the constant bias column.
poly1 = PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)
df_2 = pd.DataFrame(poly1.fit_transform(df), columns=_feature_names(poly1))
print(df_2)