-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathonehot.py
More file actions
37 lines (31 loc) · 1.43 KB
/
onehot.py
File metadata and controls
37 lines (31 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def pandasMain(path='adult.data'):
    """Fit a logistic-regression income classifier on one-hot encoded data.

    Reads a header-less CSV, keeps a subset of its columns, one-hot encodes
    the categorical ones with ``pd.get_dummies``, fits a
    ``LogisticRegression`` on a train/test split and prints the accuracy on
    the held-out part. Intermediate column lists and shapes are printed too.

    Args:
        path: Location of the CSV file to load. Defaults to ``'adult.data'``,
            resolved relative to the current working directory. The file is
            expected to have no header row and its columns in the order
            given by ``column_names`` below.

    Returns:
        None. All results are written to stdout.
    """
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ]
    data = pd.read_csv(path, header=None, index_col=False, names=column_names)

    # Keep only the columns used by this example.
    data = data[['age', 'workclass', 'education', 'gender',
                 'hours-per-week', 'occupation', 'income']]

    # Show how often each value of a single column occurs.
    print(data.gender.value_counts())

    # One-hot encode the non-numeric columns.
    print("Original features:\n", list(data.columns), "\n")
    data_dummies = pd.get_dummies(data)
    print("Features after get_dummies:\n", list(data_dummies.columns))

    # Use every column except the encoded target as a feature. Selecting by
    # prefix instead of a label slice avoids depending on the column order
    # or on one particular category being present in the data.
    feature_columns = [
        column for column in data_dummies.columns
        if not column.startswith('income_')
    ]
    features = data_dummies[feature_columns]
    x = features.values
    # get_dummies names columns "<column>_<value>"; the space after the
    # underscore is part of the value as stored in the file.
    y = data_dummies['income_ >50K'].values
    print("x.shape: {} y.shape: {}".format(x.shape, y.shape))

    # Train the model and report accuracy on the held-out split.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
    logreg = LogisticRegression()
    logreg.fit(x_train, y_train)
    print('Test score: {:.2f}'.format(logreg.score(x_test, y_test)))
if __name__ == "__main__":
    # Script entry point: run the pandas-based one-hot encoding example.
    pandasMain()