项目_kaggle房价预测01

ipynb 转化(对应notebook文件(图片路径需要重新生成):python_myproject/kaggle_housePrice/house_price01.ipynb)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
%run MyTools.py
import re as re
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
import re as re
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

加载训练与测试数据

1
2
3
4
5
6
7
8
# Load the raw train/test CSV files into DataFrames.
print ('Read Data From File')
# FILE_DIR='/media/john/10167DFA167DE0E01/TDDOWNLOAD/ML/titanic/officialData'
FILE_DIR='/home/ds/notebooks/kaggle_house_price/officialData'

data_train = pd.read_csv('{0}/train.csv'.format(FILE_DIR))
data_test=pd.read_csv('{0}/test.csv'.format(FILE_DIR))
data_train.head()
Read Data From File

IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000

5 rows × 81 columns

合并训练预测数据

1
2
3
4
5
6
7
8
9
10
11
print ('View Data')
#view data
# Column names of the row identifier and the prediction target.
column_id='Id'
column_label='SalePrice'
# Keep the id series of each split so the merged frame can be re-split later.
train_id=data_train[column_id]
test_id=data_test[column_id]

# Stack train and test into one frame so feature engineering is applied to both.
data_full=data_train.append(data_test, ignore_index=True)
# Test rows have no SalePrice; fill with the placeholder 1 so the cast to int works.
data_full[column_label]=data_full[column_label].fillna(1).astype(int)

data_full.head()
View Data

1stFlrSF2ndFlrSF3SsnPorchAlleyBedroomAbvGrBldgTypeBsmtCondBsmtExposureBsmtFinSF1BsmtFinSF2...SaleTypeScreenPorchStreetTotRmsAbvGrdTotalBsmtSFUtilitiesWoodDeckSFYearBuiltYearRemodAddYrSold
08568540NaN31FamTANo706.00.0...WD0Pave8856.0AllPub0200320032008
1126200NaN31FamTAGd978.00.0...WD0Pave61262.0AllPub298197619762007
29208660NaN31FamTAMn486.00.0...WD0Pave6920.0AllPub0200120022008
39617560NaN31FamGdNo216.00.0...WD0Pave7756.0AllPub0191519702006
4114510530NaN41FamTAAv655.00.0...WD0Pave91145.0AllPub192200020002008

5 rows × 81 columns

待预测特征观察

  1. 数据分布概况
  2. 峰值和偏差
  3. 数据分布图
1
2
3
4
5
6
7
print data_full[data_full[column_id].isin(id_train)][column_label].describe()
#skewness and kurtosis
print("Skewness: %f" % data_full[data_full[column_id].isin(id_train)][column_label].skew())
print("Kurtosis: %f" % data_full[data_full[column_id].isin(id_train)][column_label].kurt())
# 在统计学中,峰度(Kurtosis)衡量实数随机变量概率分布的峰态。峰度高就意味着方差增大是由低频度的大于或小于平均值的极端差值引起的。
FeatureEngineerTools.show_contin_columns(data_train,[column_label])
res = stats.probplot(data_train[column_label], plot=plt)
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64
Skewness: 1.882876
Kurtosis: 6.536282

log变换

1
2
3
4
5
# Log-transform the right-skewed target so it is closer to a normal distribution.
data_full[column_label]=data_full[column_label].apply(np.log)

# Re-slice the training rows and re-inspect the (now log-scaled) target.
data_train=data_full[data_full[column_id].isin(train_id)]
FeatureEngineerTools.show_contin_columns(data_train,[column_label])
res = stats.probplot(data_train[column_label], plot=plt)

特征稍多,无法依次观察,先观察特征数据类型

1
2
3
4
5
# Partition the columns by dtype: non-object columns are treated as numeric
# ("quantitative"), object columns as categorical ("qualitative").
quantitative = []
qualitative = []
for col in data_full.columns:
    if data_full.dtypes[col] == 'object':
        qualitative.append(col)
    else:
        quantitative.append(col)

# Report the counts and the share of each kind.
n_quant = len(quantitative)
n_qual = len(qualitative)
total = n_quant + n_qual
print("all: {} quantitative: {}({}), qualitative: {}({})"
      .format(total, n_quant, float(n_quant) / total, n_qual, float(n_qual) / total))
all: 81 quantitative: 38(0.469135802469), qualitative: 43(0.530864197531)

特征缺失值分布情况

1
2
3
4
5
6
7
8
9
# Count missing values per column, most-missing first.
missing = data_full.isnull().sum().sort_values(ascending=False)
print 'missing feature count:',missing[missing>0].size
print 'missing feature count > 10:',missing[missing>10].size
# Summarise count, rate and dtype of every column with at least one missing value.
missingInfo=pd.DataFrame()
missingInfo['missing_count']=missing[missing>0]
missingInfo['missing_rate']=missingInfo['missing_count']/data_full.shape[0]
missingInfo['missing_type']=data_full[missingInfo['missing_count'].index].dtypes
print missingInfo[missingInfo['missing_count']>10]
missingInfo[missingInfo['missing_count']>10].plot.bar()
missing feature count: 34
missing feature count > 10: 18
              missing_count  missing_rate missing_type
PoolQC                 2909      0.996574       object
MiscFeature            2814      0.964029       object
Alley                  2721      0.932169       object
Fence                  2348      0.804385       object
FireplaceQu            1420      0.486468       object
LotFrontage             486      0.166495      float64
GarageCond              159      0.054471       object
GarageFinish            159      0.054471       object
GarageQual              159      0.054471       object
GarageYrBlt             159      0.054471      float64
GarageType              157      0.053786       object
BsmtCond                 82      0.028092       object
BsmtExposure             82      0.028092       object
BsmtQual                 81      0.027749       object
BsmtFinType2             80      0.027407       object
BsmtFinType1             79      0.027064       object
MasVnrType               24      0.008222       object
MasVnrArea               23      0.007879      float64





<matplotlib.axes._subplots.AxesSubplot at 0x7fb481296410>

观察特征和预测目标的相关度

1
2
3
4
5
6
7
8
9
10
11
12
# Inline heatmap version kept for reference; the helper call below replaces it.
# corrmat = data_train.corr()
# k = 10 #number of variables for heatmap
# cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# cm = np.corrcoef(data_train[cols].values.T)
# sns.set(font_scale=1.25)
# hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()

# %run MyTools.py
# Re-slice the training rows, then plot the k features most correlated with the
# target (helper from MyTools.py; presumably returns the k column names — TODO confirm).
data_train=data_full[data_full[column_id].isin(train_id)]
goodFeature=FeatureEngineerTools.heatmap(data_train,column_label,k=10)
print 'goodFeature:',goodFeature
goodFeature: ['SalePrice' 'OverallQual' 'GrLivArea' 'GarageCars' 'GarageArea'
 'TotalBsmtSF' '1stFlrSF' 'FullBath' 'YearBuilt' 'YearRemodAdd']

1
2
# Which of the top-correlated features also have missing values?
print missingInfo.index.intersection(goodFeature)
missingInfo.loc[missingInfo.index.intersection(goodFeature)]
Index([u'GarageCars', u'TotalBsmtSF', u'GarageArea'], dtype='object')

missing_countmissing_ratemissing_type
GarageCars10.000343float64
TotalBsmtSF10.000343float64
GarageArea10.000343float64

相关性最高的10个属性中,3个属性有缺失,但每个只缺失一个值,最简单的处理方法是丢弃

需要检查缺失的数据是否在测试集data_test中

不巧,不好直接丢弃,缺失数据恰好在测试集中存在

处理缺失数据

1
2
# Check whether the missing values of those three features fall in the test
# split (they do, so the affected rows cannot simply be dropped).
data_test=data_full[data_full[column_id].isin(test_id)]
data_test[[u'GarageCars', u'TotalBsmtSF', u'GarageArea']].isnull().sum()
GarageCars     1
TotalBsmtSF    1
GarageArea     1
dtype: int64
1
2
# Drop every column with more than 50 missing values (second arg 1 = axis=columns).
data_full = data_full.drop((missingInfo[missingInfo['missing_count'] > 50]).index,1)
print data_full.isnull().sum().count(),data_full.isnull().sum().max() #just checking that there's no missing data missing...
65 24
1
2
# Impute the remaining numeric NaNs with the column mean.
# FIX: the loop-body indentation was lost in the export; the assignment below
# is the body of the for loop.
for column in data_full.columns.intersection(quantitative):
    data_full[column] = data_full[column].fillna(data_full[column].dropna().mean())
1
print data_full.isnull().sum().count(),data_full.isnull().sum().max()
65 24
1
2
3
# f = pd.melt(data_full, value_vars=quantitative)
# g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
# g = g.map(sns.distplot, "value")

定量属性的偏差

1
2
# Restrict `quantitative` to the columns that survived the drop, then rank the
# numeric features by skewness (candidates for a log/Box-Cox transform).
quantitative=data_full.columns.intersection(quantitative)
data_full[quantitative].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
MiscVal          21.947195
PoolArea         16.898328
LotArea          12.822431
LowQualFinSF     12.088761
3SsnPorch        11.376065
KitchenAbvGr      4.302254
BsmtFinSF2        4.146034
EnclosedPorch     4.003891
ScreenPorch       3.946694
BsmtHalfBath      3.931343
MasVnrArea        2.611549
OpenPorchSF       2.535114
WoodDeckSF        1.842433
1stFlrSF          1.469604
BsmtFinSF1        1.425233
MSSubClass        1.375457
GrLivArea         1.269358
TotalBsmtSF       1.162484
BsmtUnfSF         0.919508
2ndFlrSF          0.861675
TotRmsAbvGrd      0.758367
Fireplaces        0.733495
HalfBath          0.694566
BsmtFullBath      0.623955
OverallCond       0.570312
BedroomAbvGr      0.326324
GarageArea        0.241218
OverallQual       0.197110
MoSold            0.195884
FullBath          0.167606
YrSold            0.132399
SalePrice         0.005930
Id                0.000000
GarageCars       -0.218298
YearRemodAdd     -0.451020
YearBuilt        -0.599806
dtype: float64
1
2
# train = all_df.loc[train_df.index]
# train['SalePrice'] = train_df.SalePrice

一元方差分析p值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def anova(frame, features=None):
    """One-way ANOVA p-value of SalePrice grouped by each categorical feature.

    FIX: the body indentation was lost in the export. Also generalized with a
    backward-compatible `features` parameter; it defaults to the notebook-global
    `qualitative` list, matching the original behavior.

    Parameters
    ----------
    frame : DataFrame containing a 'SalePrice' column and the feature columns.
    features : iterable of categorical column names, or None for `qualitative`.

    Returns
    -------
    DataFrame with 'feature' and 'pval' columns sorted ascending by p-value
    (a low p-value means the feature's classes separate SalePrice well).
    """
    if features is None:
        features = qualitative
    anv = pd.DataFrame()
    anv['feature'] = features
    pvals = []
    for c in features:
        # One SalePrice sample per class of the feature.
        samples = []
        for cls in frame[c].unique():
            s = frame[frame[c] == cls]['SalePrice'].values
            samples.append(s)
        pval = stats.f_oneway(*samples)[1]
        pvals.append(pval)
    anv['pval'] = pvals
    return anv.sort_values('pval')

# Run the ANOVA on the training rows and visualise log(1/pval) ("disparity"):
# taller bars = categorical features whose classes separate SalePrice better.
data_train=data_full[data_full[column_id].isin(train_id)].copy()
qualitative=data_train.columns.intersection(qualitative)

print
a = anova(data_train)
a['disparity'] = np.log(1./a['pval'].values)
sns.barplot(data=a, x='feature', y='disparity')
x=plt.xticks(rotation=90)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def encode(frame, feature):
    """Ordinally encode a categorical column by mean SalePrice.

    Classes are ranked by the mean SalePrice of their rows (1 = lowest mean),
    and the rank is written back to `frame` as a new column `<feature>_E`.
    Mutates `frame` in place; returns None.

    FIX: the body indentation was lost in the export; dead commented-out
    loop variant removed.
    """
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    # Mean target value per class, then rank the classes by that mean.
    ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0] + 1)
    # {class value -> rank} lookup used to build the encoded column.
    ordering = ordering['ordering'].to_dict()

    frame[feature + '_E'] = frame[feature].map(ordering)

# Encode every qualitative column in place and collect the new `<name>_E`
# column names.
# FIX: the loop-body indentation was lost in the export.
qual_encoded = []
for q in qualitative:
    encode(data_train, q)
    qual_encoded.append(q + '_E')
print(qual_encoded)
['BldgType_E', 'CentralAir_E', 'Condition1_E', 'Condition2_E', 'Electrical_E', 'ExterCond_E', 'ExterQual_E', 'Exterior1st_E', 'Exterior2nd_E', 'Foundation_E', 'Functional_E', 'Heating_E', 'HeatingQC_E', 'HouseStyle_E', 'KitchenQual_E', 'LandContour_E', 'LandSlope_E', 'LotConfig_E', 'LotShape_E', 'MSZoning_E', 'MasVnrType_E', 'Neighborhood_E', 'PavedDrive_E', 'RoofMatl_E', 'RoofStyle_E', 'SaleCondition_E', 'SaleType_E', 'Street_E', 'Utilities_E']
1
2
3
# Sanity check: every categorical column got an encoded twin, and the encoded
# value counts mirror the raw class counts (e.g. Norm=1445 maps to code 5=1445).
print data_train.columns
print data_train[u'Condition2_E'].value_counts()
print data_train[u'Condition2'].value_counts()
Index([u'1stFlrSF', u'2ndFlrSF', u'3SsnPorch', u'BedroomAbvGr', u'BldgType',
       u'BsmtFinSF1', u'BsmtFinSF2', u'BsmtFullBath', u'BsmtHalfBath',
       u'BsmtUnfSF', u'CentralAir', u'Condition1', u'Condition2',
       u'Electrical', u'EnclosedPorch', u'ExterCond', u'ExterQual',
       u'Exterior1st', u'Exterior2nd', u'Fireplaces', u'Foundation',
       u'FullBath', u'Functional', u'GarageArea', u'GarageCars', u'GrLivArea',
       u'HalfBath', u'Heating', u'HeatingQC', u'HouseStyle', u'Id',
       u'KitchenAbvGr', u'KitchenQual', u'LandContour', u'LandSlope',
       u'LotArea', u'LotConfig', u'LotShape', u'LowQualFinSF', u'MSSubClass',
       u'MSZoning', u'MasVnrArea', u'MasVnrType', u'MiscVal', u'MoSold',
       u'Neighborhood', u'OpenPorchSF', u'OverallCond', u'OverallQual',
       u'PavedDrive', u'PoolArea', u'RoofMatl', u'RoofStyle', u'SaleCondition',
       u'SalePrice', u'SaleType', u'ScreenPorch', u'Street', u'TotRmsAbvGrd',
       u'TotalBsmtSF', u'Utilities', u'WoodDeckSF', u'YearBuilt',
       u'YearRemodAdd', u'YrSold', u'BldgType_E', u'CentralAir_E',
       u'Condition1_E', u'Condition2_E', u'Electrical_E', u'ExterCond_E',
       u'ExterQual_E', u'Exterior1st_E', u'Exterior2nd_E', u'Foundation_E',
       u'Functional_E', u'Heating_E', u'HeatingQC_E', u'HouseStyle_E',
       u'KitchenQual_E', u'LandContour_E', u'LandSlope_E', u'LotConfig_E',
       u'LotShape_E', u'MSZoning_E', u'MasVnrType_E', u'Neighborhood_E',
       u'PavedDrive_E', u'RoofMatl_E', u'RoofStyle_E', u'SaleCondition_E',
       u'SaleType_E', u'Street_E', u'Utilities_E'],
      dtype='object')
5    1445
3       6
7       2
2       2
1       2
8       1
6       1
4       1
Name: Condition2_E, dtype: int64
Norm      1445
Feedr        6
Artery       2
RRNn         2
PosN         2
RRAn         1
RRAe         1
PosA         1
Name: Condition2, dtype: int64
1
2
3
4
5
# missing_data = all_df.isnull().sum()
# missing_data = missing_data[missing_data>0]
# ids = all_df[missing_data.index].isnull()
# # index (0), columns (1)
# all_df.loc[ids[ids.any(axis=1)].index][missing_data.index]
1
# train.loc[1379,'Electrical_E']
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def spearman(frame, features):
    """Bar-plot the Spearman rank correlation of each feature with SalePrice.

    Series.corr(other, method) excludes missing values. Spearman is used
    because the relation with price need only be monotonic, not linear.

    FIX: the body indentation was lost in the export.
    """
    spr = pd.DataFrame()
    spr['feature'] = features
    spr['spearman'] = [frame[f].corr(frame['SalePrice'], 'spearman') for f in features]
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25 * len(features)))  # width, height in inches
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')

# Correlate every numeric and encoded-categorical feature with the target.
features = quantitative.tolist()
features.extend(qual_encoded)
print features
spearman(data_train, features)
['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GrLivArea', 'HalfBath', 'Id', 'KitchenAbvGr', 'LotArea', 'LowQualFinSF', 'MSSubClass', 'MasVnrArea', 'MiscVal', 'MoSold', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea', 'SalePrice', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'YrSold', 'BldgType_E', 'CentralAir_E', 'Condition1_E', 'Condition2_E', 'Electrical_E', 'ExterCond_E', 'ExterQual_E', 'Exterior1st_E', 'Exterior2nd_E', 'Foundation_E', 'Functional_E', 'Heating_E', 'HeatingQC_E', 'HouseStyle_E', 'KitchenQual_E', 'LandContour_E', 'LandSlope_E', 'LotConfig_E', 'LotShape_E', 'MSZoning_E', 'MasVnrType_E', 'Neighborhood_E', 'PavedDrive_E', 'RoofMatl_E', 'RoofStyle_E', 'SaleCondition_E', 'SaleType_E', 'Street_E', 'Utilities_E']

1
2
# Show the final feature lists used above.
print quantitative
print qual_encoded
Index([u'1stFlrSF', u'2ndFlrSF', u'3SsnPorch', u'BedroomAbvGr', u'BsmtFinSF1',
       u'BsmtFinSF2', u'BsmtFullBath', u'BsmtHalfBath', u'BsmtUnfSF',
       u'EnclosedPorch', u'Fireplaces', u'FullBath', u'GarageArea',
       u'GarageCars', u'GrLivArea', u'HalfBath', u'Id', u'KitchenAbvGr',
       u'LotArea', u'LowQualFinSF', u'MSSubClass', u'MasVnrArea', u'MiscVal',
       u'MoSold', u'OpenPorchSF', u'OverallCond', u'OverallQual', u'PoolArea',
       u'SalePrice', u'ScreenPorch', u'TotRmsAbvGrd', u'TotalBsmtSF',
       u'WoodDeckSF', u'YearBuilt', u'YearRemodAdd', u'YrSold'],
      dtype='object')
['BldgType_E', 'CentralAir_E', 'Condition1_E', 'Condition2_E', 'Electrical_E', 'ExterCond_E', 'ExterQual_E', 'Exterior1st_E', 'Exterior2nd_E', 'Foundation_E', 'Functional_E', 'Heating_E', 'HeatingQC_E', 'HouseStyle_E', 'KitchenQual_E', 'LandContour_E', 'LandSlope_E', 'LotConfig_E', 'LotShape_E', 'MSZoning_E', 'MasVnrType_E', 'Neighborhood_E', 'PavedDrive_E', 'RoofMatl_E', 'RoofStyle_E', 'SaleCondition_E', 'SaleType_E', 'Street_E', 'Utilities_E']
1
2
3
4
5
6
7
8
9
10
11
12
13
# Three correlation heatmaps: numeric vs numeric, encoded-categorical vs
# target, and numeric vs encoded-categorical (computed pairwise below).
# FIX: the nested-loop indentation was lost in the export.
plt.figure(1)
corr = data_train[quantitative.tolist()].corr()
sns.heatmap(corr)
plt.figure(2)
corr = data_train[qual_encoded+['SalePrice']].corr()
sns.heatmap(corr)
plt.figure(3)
# Cross-correlation matrix of shape [len(quantitative), len(qual_encoded)+1].
corr = pd.DataFrame(np.zeros([len(quantitative), len(qual_encoded)+1]), index=quantitative, columns=qual_encoded+['SalePrice'])
for q1 in quantitative:
    for q2 in qual_encoded + ['SalePrice']:
        corr.loc[q1, q2] = data_train[q1].corr(data_train[q2])
sns.heatmap(corr)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb48143fdd0>


1
2
3
4
5
6
7
8
9
10
11
12
13
14
# def pairplot(x, y, **kwargs):
# ax = plt.gca()
# ts = pd.DataFrame({'time': x, 'val': y})
# ts = ts.groupby('time').mean()
# ts.plot(ax=ax)
# plt.xticks(rotation=90)

# f = pd.melt(train, id_vars=['SalePrice'], value_vars=quantitative+qual_encoded)
# g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
# g = g.map(pairplot, "value", "SalePrice")

# for column in qual_encoded:
# FeatureEngineerTools.show_corr_int_label(data_train,column,'SalePrice')
# FeatureEngineerTools.show_corr_contin_label(data_train,column,'SalePrice')
---------------------------------------------------------------------------

LinAlgError                               Traceback (most recent call last)

<ipython-input-173-ddff58c709b6> in <module>()
     12 for column in qual_encoded:
     13 #     FeatureEngineerTools.show_corr_int_label(data_train,column,'SalePrice')
---> 14     FeatureEngineerTools.show_corr_contin_label(data_train,column,'SalePrice')


/home/ds/notebooks/kaggle_house_price/MyTools.py in show_corr_contin_label(data, column, label)
    281         label_value_set=data[label].value_counts().keys()
    282         for label_value in label_value_set:
--> 283             data[column][data[label] == label_value].plot(kind='kde')
    284         plt.xlabel("column")  # plots an axis lable
    285         plt.ylabel(u"密度")


/opt/ds/local/lib/python2.7/site-packages/pandas/plotting/_core.pyc in __call__(self, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2501                            colormap=colormap, table=table, yerr=yerr,
   2502                            xerr=xerr, label=label, secondary_y=secondary_y,
-> 2503                            **kwds)
   2504     __call__.__doc__ = plot_series.__doc__
   2505 


/opt/ds/local/lib/python2.7/site-packages/pandas/plotting/_core.pyc in plot_series(data, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   1925                  yerr=yerr, xerr=xerr,
   1926                  label=label, secondary_y=secondary_y,
-> 1927                  **kwds)
   1928 
   1929 


/opt/ds/local/lib/python2.7/site-packages/pandas/plotting/_core.pyc in _plot(data, x, y, subplots, ax, kind, **kwds)
   1727         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   1728 
-> 1729     plot_obj.generate()
   1730     plot_obj.draw()
   1731     return plot_obj.result


/opt/ds/local/lib/python2.7/site-packages/pandas/plotting/_core.pyc in generate(self)
    250         self._compute_plot_data()
    251         self._setup_subplots()
--> 252         self._make_plot()
    253         self._add_table()
    254         self._make_legend()


/opt/ds/local/lib/python2.7/site-packages/pandas/plotting/_core.pyc in _make_plot(self)
   1357             kwds = self._make_plot_keywords(kwds, y)
   1358             artists = self._plot(ax, y, column_num=i,
-> 1359                                  stacking_id=stacking_id, **kwds)
   1360             self._add_legend_handle(artists[0], label, index=i)
   1361 


/opt/ds/local/lib/python2.7/site-packages/pandas/plotting/_core.pyc in _plot(cls, ax, y, style, bw_method, ind, column_num, stacking_id, **kwds)
   1412 
   1413         if LooseVersion(spv) >= '0.11.0':
-> 1414             gkde = gaussian_kde(y, bw_method=bw_method)
   1415         else:
   1416             gkde = gaussian_kde(y)


/opt/ds/local/lib/python2.7/site-packages/scipy/stats/kde.pyc in __init__(self, dataset, bw_method)
    170 
    171         self.d, self.n = self.dataset.shape
--> 172         self.set_bandwidth(bw_method=bw_method)
    173 
    174     def evaluate(self, points):


/opt/ds/local/lib/python2.7/site-packages/scipy/stats/kde.pyc in set_bandwidth(self, bw_method)
    497             raise ValueError(msg)
    498 
--> 499         self._compute_covariance()
    500 
    501     def _compute_covariance(self):


/opt/ds/local/lib/python2.7/site-packages/scipy/stats/kde.pyc in _compute_covariance(self)
    508             self._data_covariance = atleast_2d(np.cov(self.dataset, rowvar=1,
    509                                                bias=False))
--> 510             self._data_inv_cov = linalg.inv(self._data_covariance)
    511 
    512         self.covariance = self._data_covariance * self.factor**2


/opt/ds/local/lib/python2.7/site-packages/scipy/linalg/basic.pyc in inv(a, overwrite_a, check_finite)
    974         inv_a, info = getri(lu, piv, lwork=lwork, overwrite_lu=1)
    975     if info > 0:
--> 976         raise LinAlgError("singular matrix")
    977     if info < 0:
    978         raise ValueError('illegal value in %d-th argument of internal '


LinAlgError: singular matrix

1
2
# Histogram of the (log-scaled) target.
# BUG FIX: the training frame is named `data_train` everywhere else in this
# notebook; `train_data` was an undefined name and raised NameError.
a = data_train['SalePrice']
a.plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7fb4813fb510>

房价二值化后的 差异因子

1
2
3
4
5
6
7
8
9
10
11
12
# Split houses into "standard" vs "pricey" at 200k (the target is on a log
# scale, hence the np.log threshold) and plot, per numeric feature, the
# relative difference of the group means.
features = quantitative

standard = data_train[data_train['SalePrice'] < np.log(200000)]
pricey = data_train[data_train['SalePrice'] >= np.log(200000)]

diff = pd.DataFrame()
diff['feature'] = features
# Relative difference: (mean_pricey - mean_standard) / mean_standard.
diff['difference'] = [(pricey[f].fillna(0.).mean() - standard[f].fillna(0.).mean())/(standard[f].fillna(0.).mean())
for f in features]

sns.barplot(data=diff, x='feature', y='difference')
x=plt.xticks(rotation=90)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# features = quantitative + qual_encoded
# model = TSNE(n_components=2, random_state=0, perplexity=50)
# X = train[features].fillna(0.).values
# tsne = model.fit_transform(X)

# std = StandardScaler()
# s = std.fit_transform(X)
# pca = PCA(n_components=30)
# pca.fit(s)
# pc = pca.transform(s)
# kmeans = KMeans(n_clusters=5)
# kmeans.fit(pc)

# fr = pd.DataFrame({'tsne1': tsne[:,0], 'tsne2': tsne[:, 1], 'cluster': kmeans.labels_})
# sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)
# print(np.sum(pca.explained_variance_ratio_))
0.838490066289

1
2


Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×