前言:总结这篇文章主要为了让自己可再次巩固数据分析的过程,希望以后有机会参加更多的kaggle比赛,本文适合小白
题目链接:Titanic: Machine Learning from Disaster | Kaggle
目的:根据所提供的训练集和测试集,要求运用机器学习工具来预测哪些乘客幸免于悲剧
数据导入和概览
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
os.chdir('/Users/xy/Desktop/Titanic/titanic')  # NOTE(review): machine-specific path — adjust for your environment
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(type(train),train.shape)#train is a DataFrame, shape (891, 12)
print(type(test),test.shape)#test is a DataFrame, shape (418, 11) — it has no Survived column
print(train.info())

PassengerId 乘客ID
Survived 是否获救(0-未获救 1-获救)
Pclass 舱位等级(1-高等 2-中等 3-低等)
Name 姓名
Sex 性别
Age 年龄
SibSp 同船的兄弟姐妹/配偶人数
Parch 同船的父母/孩子人数
Ticket 船票信息
Fare 票价
Cabin 客舱
Embarked 登船港口
这样就知道Age有177条缺失,Cabin有687条缺失,Embarked有2条缺失
数据清洗
# Keep only the columns used downstream; Name and Ticket are dropped as
# (subjectively) unrelated to survival.
# Bug fix: the original kept a slice-view of `train`, so every later column
# assignment triggered SettingWithCopyWarning and could silently fail to
# write. .copy() makes data_train an independent frame.
data_train = train[['PassengerId','Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']].copy()
print(data_train.head(5))

1. 将男女性别用int表示(男-1 女-0)
# Encode Sex as an integer feature: male -> 1, female -> 0.
# Bug fix: the original's trailing data_train['Sex'].astype(int) was a
# no-op — astype returns a new Series and the result was never assigned
# back. Map and cast in a single assignment instead.
data_train['Sex'] = data_train['Sex'].map({'male': 1, 'female': 0}).astype(int)
2. Embarked空值填充:众数填充并转换成数值S-0 C-1 Q-2
# Fill the 2 missing Embarked values with the mode, then encode S->0, C->1, Q->2.
# Bug fix: the original computed the mode but then filled with a hard-coded
# 'S'; here the computed mode (which is indeed 'S' for this dataset) is
# actually used, so the code stays correct on other splits of the data.
mod = data_train['Embarked'].mode()[0]
data_train['Embarked'] = data_train['Embarked'].fillna(mod)
data_train['Embarked'] = data_train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
3. Cabin缺失值较多,处理成yes no
# Cabin is mostly missing (687/891), so reduce it to a presence flag:
# 'Yes' when a cabin was recorded, 'No' otherwise. Deriving the flag from
# notna() in one pass avoids the deprecated inplace replace-on-a-column
# pattern and the (theoretical) collision with a real cabin value of 'No'.
data_train['Cabin'] = np.where(data_train['Cabin'].notna(), 'Yes', 'No')
4. 构建两个新字段:家庭总人口数、是否单身1-单身 0-不单身
# Derived features: total family size aboard (self + siblings/spouses +
# parents/children) and a single-traveller flag (isalone: 1 = alone).
data_train['familysize'] = data_train['SibSp'] + data_train['Parch'] + 1
data_train['isalone'] = (data_train['familysize'] == 1).astype(int)
5. Age空值填充:用不同舱位男女的年龄中位数填充
# Fill missing Age with the median age of passengers sharing the same
# (Pclass, Sex) group. groupby().transform('median') broadcasts the group
# median back to every row, so the fill happens with a plain aligned
# fillna — no need for the original set_index / inplace-fillna /
# reset_index round-trip, which relied on deprecated inplace fillna and
# reshuffled the column order as a side effect.
age_median = data_train.groupby(['Pclass', 'Sex'])['Age'].transform('median')
data_train['Age'] = data_train['Age'].fillna(age_median)
数据初步分析:各字段可视化
1. 根据Pclass可视化
# One pie chart per passenger class showing survived (1) vs died (0) share.
# Bug fix: the original labelled the Pclass=2 and Pclass=3 pies with
# p1.index (a copy-paste error); each pie now uses its own value counts
# and index. The three near-identical subplot stanzas are folded into a loop.
fig = plt.figure()
for i, pclass in enumerate([1, 2, 3], start=1):
    counts = (data_train[data_train['Pclass'] == pclass]['Survived']
              .value_counts().sort_index(ascending=False))
    fig.add_subplot(1, 3, i)
    plt.axis('equal')
    plt.title('Pclass=%d' % pclass)
    plt.pie(counts,
            labels=counts.index,
            autopct='%.1f%%',
            startangle=0,
            radius=2,
            explode=[0.05, 0])
fig.tight_layout(pad=0.5, w_pad=8.0, h_pad=5.0)
以上可知,上等阶级的获救率高于中下等阶级获救率;因此Pclass可以作为特征的预测。

2. 根据Sex可视化
# One pie chart per sex (after encoding: 1 = male, 0 = female) showing
# survived vs died share.
# Bug fix: the female pie was labelled with p1.index (copy-paste error);
# each pie now uses its own relabelled index.
fig = plt.figure()
for i, (sex_code, title) in enumerate([(1, 'Sex=male'), (0, 'Sex=female')], start=1):
    counts = (data_train[data_train['Sex'] == sex_code]['Survived']
              .value_counts().sort_index(ascending=False))
    counts.index = ['Survived', 'Died']
    fig.add_subplot(1, 2, i)
    plt.axis('equal')
    plt.title(title)
    plt.pie(counts,
            labels=counts.index,
            autopct='%.1f%%',
            startangle=0,
            radius=2,
            explode=[0.05, 0])
fig.tight_layout(pad=0.5, w_pad=18.0, h_pad=5.0)
以上可知,男性的获救率不到20%,而女性的获救率74%左右,说明性别与是否获救有重大影响

3. 各舱级下各性别的获救情况
# 2x2 grid of bar charts: survival counts split by sex and by cabin class
# (Pclass 3 = "low", Pclass 1/2 = "high").
# Fixes vs the original:
#  * boolean conditions are combined with & instead of chained
#    df[cond1][cond2] indexing, which relies on index alignment and
#    triggers pandas indexing warnings;
#  * the per-plot figsize=(8,3) argument is dropped — it conflicted with
#    the enclosing figure's figsize=(10,10) and was effectively ignored.
fig = plt.figure(figsize=(10, 10))
panels = [
    (u'女性低级舱', (data_train.Sex == 0) & (data_train.Pclass == 3)),
    (u'女性高级舱', (data_train.Sex == 0) & (data_train.Pclass != 3)),
    (u'男性低级舱', (data_train.Sex == 1) & (data_train.Pclass == 3)),
    (u'男性高级舱', (data_train.Sex == 1) & (data_train.Pclass != 3)),
]
for i, (title, mask) in enumerate(panels, start=1):
    ax = fig.add_subplot(2, 2, i)
    plt.title(title)
    plt.ylabel('人数')
    plt.ylim((0, 500))
    counts = data_train.Survived[mask].value_counts().sort_index()
    counts.plot(kind='bar', alpha=0.8, ax=ax)
    ax.set_xticklabels([u'未获救', u'获救'], rotation=0)
上等阶级男性和女性的生还率均比其他两阶级的高。对于女性来说,上等阶级生还率比中下阶级生还率高。
对于男性来说,上等阶级生还率比中下阶级生还率略微高些;这里说明了阶级可以作为特征预测获救率。

4. 根据Age区间可视化
# Bucket Age and compare survived vs died counts per bucket.
# Bug fix: the original cast Age to int before cutting; pd.cut uses
# right-closed bins, so the first bucket is (0, 18] — infants with age
# < 1 were truncated to 0 and fell outside every bin, silently dropping
# them from the chart. Cutting the float ages keeps those passengers.
bins = [0, 18, 25, 35, 45, 60, 100]
data_train['Age区间'] = pd.cut(data_train['Age'], bins)
age1 = data_train[data_train['Survived'] == 1]['Age区间'].value_counts()
age0 = data_train[data_train['Survived'] == 0]['Age区间'].value_counts()
df_age = pd.DataFrame({'未获救': age0, '获救': age1})
df_age.plot(kind='bar', stacked=True)
未成年、小孩生还率较高,也证实了影片中的一句话:妇女和小孩先走

5. 根据登船港口的获救情况
# Stacked bar chart: survival counts per embarkation port (encoded 0/1/2).
died = data_train['Survived'] == 0
Embarked0 = data_train.loc[died, 'Embarked'].value_counts()
Embarked1 = data_train.loc[~died, 'Embarked'].value_counts()
df_Embarked = pd.DataFrame({'未获救': Embarked0, '获救': Embarked1})
df_Embarked.plot(kind='bar', stacked=True)
plt.title('各登船港口乘客获救情况')
plt.xlabel('港口分类')
plt.ylabel('人数')
C港口获救比例居中,Embarked也可以作为预测特征。

6. 根据父母、子女可视化
data_train['familysize'] = data_train['Parch']+data_train['SibSp']+1  # NOTE(review): recomputes the column already built in cleaning step 4 — idempotent but redundant
data_train['isalone'] = np.where(data_train['familysize']==1,1,0)  # NOTE(review): the plot backing this section's conclusion is not shown in the source
带孩子有父母的获救情况较好(注:此处对应的可视化代码未在文中给出)

7. 根据家庭人数可视化
data_train['familysize'] = data_train['Parch']+data_train['SibSp']+1  # NOTE(review): third recomputation of the same derived column — idempotent but redundant
data_train['isalone'] = np.where(data_train['familysize']==1,1,0)  # NOTE(review): the family-size plot this section refers to is not shown in the source
家庭人数在2,3,4的获救较好(注:此处对应的可视化代码未在文中给出)

说明有一个两个兄弟或者配偶比没有兄弟配偶强,毕竟遇到困难都会与自己关系要好的人汇合;但是一旦兄弟或配偶增多,生还几率越低,毕竟一个人出了问题一群人要等着。
8. 根据是否单身可视化
# Stacked bar chart: survival counts for single vs non-single passengers.
survived_flag = data_train['Survived'] == 1
isalone1 = data_train.loc[survived_flag, 'isalone'].value_counts()
isalone0 = data_train.loc[~survived_flag, 'isalone'].value_counts()
df_isalone = pd.DataFrame({'未获救': isalone0, '获救': isalone1})
df_isalone.plot(kind='bar', stacked=True)
plt.title('是否单身乘客获救情况')
非单身获救比单身获救概率高一些

数据预处理
1. 将特征因子化:pd.get_dummies()
因为在逻辑回归建模时,需要输入的特征都是数值型,所以需要因子化;以Cabin举例:Cabin有yes no两个值,在这里Cabin因子化后Cabin_yes=1,Cabin_no=0
# One-hot encode the categorical features for logistic regression; each
# dummy column is prefixed with its source column name. The original
# categorical columns are dropped once their dummies are attached.
cat_cols = ['Cabin', 'Embarked', 'Pclass', 'Sex']
dummy_frames = [pd.get_dummies(data_train[col], prefix=col) for col in cat_cols]
data_new = pd.concat([data_train] + dummy_frames, axis=1)
data_new.drop(['Pclass', 'Sex', 'Cabin', 'Embarked'], axis=1, inplace=True)
2. 将Age和Fare0-1标准化
# Min-max scale Age and Fare into [0, 1] so neither dominates the
# logistic-regression weights. Factored into a helper to remove the
# copy-pasted min/max boilerplate of the original.
def _minmax_scale(series):
    """Return `series` linearly rescaled to the [0, 1] range."""
    lo = series.min()
    hi = series.max()
    return (series - lo) / (hi - lo)

data_new['Age_n'] = _minmax_scale(data_new['Age'])
data_new['Fare_n'] = _minmax_scale(data_new['Fare'])
对test.csv做以上同样的变化
建立逻辑回归模型
from sklearn import linear_model

# Feature matrix: drop the id, the unscaled Age/Fare (replaced by
# Age_n / Fare_n) and the display-only age-bucket column.
train_df = data_new.drop(['PassengerId', 'Age', 'Fare', 'Age区间'], axis=1)
# Bug fix: DataFrame.as_matrix() was removed in pandas 1.0 — use .values.
train_np = train_df.values
y = train_np[:, 0]   # first column is Survived
x = train_np[:, 1:]  # remaining columns are the features
# Bug fix: the default lbfgs solver does not support penalty='l1';
# liblinear must be requested explicitly for L1 regularisation.
model = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
model.fit(x, y)
预测结果
# Predict on the (identically preprocessed) test set and write the Kaggle
# submission file.
test_df = test_new.drop(['PassengerId', 'Age', 'Fare'], axis=1)
pre = model.predict(test_df)
# Bug fixes vs the original:
#  * `data_test` was never defined — the passenger ids come from the raw
#    `test` frame loaded at the top of the script;
#  * Survived must be an integer column and the csv must not carry the
#    DataFrame index, otherwise Kaggle rejects the submission format.
result = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pre.astype(int)})
result.to_csv('/Users/xy/Desktop/Titanic/titanic/logical_n1.csv', index=False)
print(result)
总结:
这里通过性别、港口、家庭人数、是否单身、客舱是否空值、舱位等级、年龄、票价几个属性做预测,属于baseline model基准模型,后续还需要深入优化/(ㄒoㄒ)/~~
有空再来更~~
网友评论