特征工程-特征构建

  |   0 评论   |   51 浏览

特征构建

  1. 填充缺失值(定性特征与定量特征)

  2. 对连续特征进行分箱

  3. 用虚拟变量(哑变量)编码分类列

  4. 对定序类别的字符串取值进行编码

# One-element list holding a single example record as a dict.
# Nothing else in the visible code reads this name.
xs = [
    dict(
        boolean=True,
        city='tokyo',
        ordinal_column='somewhat like',
        quantitative_column=1.0,
    ),
]

import pandas as pd

# Toy dataset: 'boolean', 'city' and 'quantitative' each contain one
# missing entry; 'ordinal_column' is complete.
df = pd.DataFrame(
    data=dict(
        boolean=['yes', 'no', None, 'no', 'no', 'yes'],
        city=['tokyo', None, 'london', 'settle', 'san francisco', 'tokyo'],
        ordinal_column=['somewhat like', 'like', 'somewhat like', 'like', 'somewhat like', 'dislike'],
        quantitative=[1, 11, -0.5, 10, None, 20],
    ),
)

# Show the first six rows (the frame has exactly six).
print(df.head(n=6))

# Per-column count of missing values, framed by separator lines.
print('*' * 40)
print(df.isna().sum())
print('*' * 40)

# Most frequent city: value_counts() sorts by descending frequency.
print(df.city.value_counts().index[0])
# boolean: binary categorical data (nominal level)
# city: categorical data (nominal level)
from sklearn.base import TransformerMixin


class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in categorical columns with the most frequent value.

    The fill value for each column is taken from the frame passed to
    ``transform``; ``fit`` learns nothing.

    Parameters:
        cols: iterable of column names to impute.
    """

    def __init__(self, cols):
        self.cols = cols

    def transform(self, df):
        """Return a copy of ``df`` with missing entries of ``self.cols`` filled.

        Each listed column has its missing entries replaced by that column's
        most frequent non-missing value. A column with no observed values is
        left unchanged because there is nothing to fill it with. The input
        frame is not modified.
        """
        X = df.copy()
        for col in self.cols:
            counts = X[col].value_counts()
            if counts.empty:
                # No observed value exists, so counts.index[0] would raise.
                continue
            # Assign the filled column back rather than calling
            # fillna(inplace=True) on X[col]: the latter operates on an
            # intermediate object, which pandas warns about and which does
            # not update X when Copy-on-Write is enabled.
            X[col] = X[col].fillna(counts.index[0])
        return X

    def fit(self, *_):
        """No-op fit; returns ``self`` so the object can be chained."""
        return self


# Impute the categorical columns, then print the result under a separator line.
df_1 = CustomCategoryImputer(cols=['city', 'boolean']).fit_transform(df)
print('*' * 40, df_1, sep='\n')

from sklearn.impute import SimpleImputer


# quantitative column: numeric data (ratio level)
class CustomQuantitativeImputer(TransformerMixin):
    """Fill missing values in numeric columns using ``SimpleImputer``.

    The imputer is re-fitted on the frame passed to ``transform``, one
    column at a time; ``fit`` learns nothing.

    Parameters:
        cols: iterable of column names to impute. ``None`` (the default)
            means every numeric column of the frame passed to ``transform``.
        strategy: strategy name forwarded to ``SimpleImputer``.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def transform(self, df):
        """Return a copy of ``df`` with the selected columns imputed.

        The input frame is not modified.
        """
        X = df.copy()
        if self.cols is None:
            # The default used to fail with "NoneType is not iterable";
            # fall back to all numeric columns instead.
            cols = X.select_dtypes(include='number').columns
        else:
            cols = self.cols
        impute = SimpleImputer(strategy=self.strategy)
        for col in cols:
            # Double brackets keep the selection 2-D, as SimpleImputer expects.
            X[col] = impute.fit_transform(X[[col]])
        return X

    def fit(self, *_):
        """No-op fit; returns ``self`` so the object can be chained."""
        return self


# Strategy names listed by the author: 'mean', 'median', 'most_frequent', 'constant'.
df_2 = CustomQuantitativeImputer(strategy='median', cols=['quantitative']).fit_transform(df_1)
# Print the imputed frame framed by separator lines.
print('*' * 40, df_2, '*' * 40, sep='\n')


class CustomDummifer(TransformerMixin):
    """Replace the selected columns with dummy (indicator) columns.

    Parameters:
        cols: column names forwarded to ``pd.get_dummies`` as ``columns``.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, *_):
        """No-op fit; returns ``self`` so the object can be chained."""
        return self

    def transform(self, df):
        """Return ``pd.get_dummies`` applied to ``df`` for ``self.cols``."""
        dummy_columns = self.cols
        encoded = pd.get_dummies(df, columns=dummy_columns)
        return encoded


# Direct pandas call kept for reference:
# c = pd.get_dummies(df_2, columns=['city', 'boolean'], prefix_sep='_')
df_3 = CustomDummifer(cols=['city', 'boolean']).fit_transform(df_2)

# Print the encoded frame followed by a separator line.
print(df_3, '*' * 40, sep='\n')


# ordinal_column: ordered categorical data (ordinal level)

class CustomEncode(TransformerMixin):
    """Encode an ordinal column as each label's position in ``ordering``.

    Parameters:
        col: name of the column to encode.
        ordering: list of labels from lowest to highest rank. If it is not
            a list (for example the default ``None``), the column is left
            untouched.
    """

    def __init__(self, col, ordering=None):
        self.col = col
        self.ordering = ordering

    def fit(self, *_):
        """No-op fit; returns ``self`` so the object can be chained."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``self.col`` mapped to integer ranks.

        A value that does not appear in ``ordering`` makes ``list.index``
        raise ``ValueError``.
        """
        encoded = df.copy()
        levels = self.ordering
        if not isinstance(levels, list):
            return encoded

        def rank_of(label):
            return levels.index(label)

        encoded[self.col] = encoded[self.col].map(rank_of)
        return encoded


df_4 = CustomEncode(
    col='ordinal_column',
    ordering=['dislike', 'somewhat like', 'like'],
).fit_transform(df_2)

# Print the encoded frame, then the separator that opens the next part:
# binning the continuous feature.
print(df_4, '*' * 40, sep='\n')


class CustomCut(TransformerMixin):
    """Bin a continuous column with ``pd.cut``.

    Parameters:
        col: name of the column to bin.
        bins: forwarded to ``pd.cut`` as ``bins``. ``None``, ``0`` or an
            empty sequence disables binning.
        labels: forwarded to ``pd.cut`` as ``labels``.
    """

    def __init__(self, col, bins=None, labels=None):
        self.col = col
        self.bins = bins
        self.labels = labels

    def _has_bins(self):
        """Tell whether ``self.bins`` holds a usable bin specification."""
        bins = self.bins
        if bins is None:
            return False
        try:
            # Sized specs (list, array, IntervalIndex): check the length
            # instead of truthiness, which is ambiguous for arrays and raises.
            return len(bins) > 0
        except TypeError:
            # Scalar spec such as an int bin count.
            return bool(bins)

    def transform(self, df):
        """Return a copy of ``df`` with ``self.col`` replaced by its bins.

        The frame is returned unchanged (as a copy) when no bins are
        configured. The input frame is not modified.
        """
        X = df.copy()
        if self._has_bins():
            X[self.col] = pd.cut(X[self.col], bins=self.bins, labels=self.labels)
        return X

    def fit(self, *_):
        """No-op fit; returns ``self`` so the object can be chained."""
        return self


# Variant with explicit bin edges kept for reference:
# df_5 = CustomCut('quantitative', bins=[1, 3, 6, 9]).fit_transform(df_2)
df_5 = CustomCut(col='quantitative', bins=2).fit_transform(df_2)
print(df_5)

if __name__ == '__main__':
    # Build the pipeline
    from sklearn.pipeline import Pipeline

    # One transformer per stage; the pipeline applies them in the order
    # given in ``steps`` below.
    cci = CustomCategoryImputer(cols=['city', 'boolean'])
    cq = CustomQuantitativeImputer(strategy='median', cols=['quantitative'])
    cc = CustomCut(col='quantitative', labels=False, bins=4)
    cd = CustomDummifer(cols=['city', 'boolean'])
    ce = CustomEncode(ordering=['dislike', 'somewhat like', 'like'], col='ordinal_column')

    pipe = Pipeline(steps=[('cci', cci), ('cq', cq), ('cc', cc), ('cd', cd), ('ce', ce)])

    print('流水线。。。。。')
    # Run every stage on the raw frame, show the result and save it as CSV.
    df_ = pipe.fit_transform(df)
    print(df_)
    df_.to_csv('index.csv')

扩展数值特征

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# Expand the numeric features with polynomial terms to get a better view
# of how the original features interact.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)

df = pd.DataFrame({
    'x1': [0, 1, 2, 3, 4, 5, 6, 7, 8],
    'x2': [11, 12, 13, 14, 15, 16, 17, 18, 19],
    'x3': [0, 0, 1, 1, 1, 1, 1, 0, 0],
})

print(df)

print('============================')
x_p = poly.fit_transform(df)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Because the transformer was
# fitted on a DataFrame, the generated names are built from the frame's own
# column names.
df_1 = pd.DataFrame(x_p, columns=poly.get_feature_names_out())
print(df_1)


# Same expansion restricted to interaction terms, with the bias column kept.
poly1 = PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)

df_2 = pd.DataFrame(poly1.fit_transform(df), columns=poly1.get_feature_names_out())
print(df_2)


标题:特征工程-特征构建
作者:lishulongVI
地址:https://blog.thinking.mobi/articles/2020/11/10/1605002204825.html

评论

发表评论