作業中のメモ

よく「計算機」を使って作業をする.知らなかったことを中心にまとめるつもり.

分類問題を解く ~分析編~

どうも,筆者です.

決定木による分類

今回は,決定木を用いて分析を行う.使用するコードを以下に示す.

#!/usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

def get_data(target_df):
    """Split a labelled weather table into a feature matrix and a label vector.

    Args:
        target_df: DataFrame containing the columns ``Temperature``,
            ``Humidity`` and ``Status``.  Any other columns are ignored.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float64 array of shape
        ``(n_rows, 2)`` holding temperature (column 0) and humidity
        (column 1), and ``y`` is an int32 array of length ``n_rows`` holding
        the ``Status`` values.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Select the columns directly instead of going through Python lists and
    # a transpose; the row/column layout of the result is the same.
    X = target_df[['Temperature', 'Humidity']].to_numpy(dtype=np.float64)
    y = target_df['Status'].to_numpy(dtype=np.int32)
    return X, y

#
# 決定境界プロット関数
#
def plot_decision_regions(x, y, model, resolution=0.01):
    """Draw the decision regions of a fitted classifier over two features.

    A regular grid is laid over the feature plane, every grid point is
    classified with ``model.predict``, and the predictions are rendered as
    filled contours.  The samples are then scattered on top: rows labelled 0
    in blue, rows labelled 1 in red.

    Args:
        x: Array indexed as ``x[:, 0]`` (horizontal axis) and ``x[:, 1]``
            (vertical axis).
        y: Labels aligned with the rows of ``x``; only rows labelled 0 or 1
            are drawn as points.
        model: Object whose ``predict`` accepts an ``(n_points, 2)`` array
            and returns an array with one value per row.
        resolution: Grid spacing used on both axes.  The number of grid
            points grows with ``1 / resolution ** 2``, so small values are
            expensive in both time and memory.

    Returns:
        The ``(fig, ax)`` pair created for the plot.
    """
    # Accept plain sequences as well as arrays: the boolean masks used for
    # the scatter plots need element-wise comparison.
    x = np.asarray(x)
    y = np.asarray(y)

    # Grid spanning the data range, padded by 1 on every side.
    x1_min, x1_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    x2_min, x2_max = x[:, 1].min() - 1, x[:, 1].max() + 1
    x1_mesh, x2_mesh = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                                   np.arange(x2_min, x2_max, resolution))

    # Classify every grid point, then restore the grid shape for contouring.
    grid_points = np.column_stack((x1_mesh.ravel(), x2_mesh.ravel()))
    z = model.predict(grid_points).reshape(x1_mesh.shape)

    # Filled contours of the predicted class show the decision regions.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.contourf(x1_mesh, x2_mesh, z, alpha=0.3, cmap='jet')
    ax.set_xlim(x1_mesh.min(), x1_mesh.max())
    ax.set_ylim(x2_mesh.min(), x2_mesh.max())

    # Each class gets a fixed colour, so no colormap is passed: matplotlib
    # ignores ``cmap`` (and warns) when ``c`` is a colour name.
    ax.scatter(x[y == 0, 0], x[y == 0, 1], c='blue')
    ax.scatter(x[y == 1, 0], x[y == 1, 1], c='red')
    ax.set_xlabel('Temperature')
    ax.set_ylabel('Humidity')
    return fig, ax

if __name__ == '__main__':
    import os

    # Load the prepared data sets; the first row of each CSV is the header.
    train_df = pd.read_csv('train_data.csv', header=0)
    test_df = pd.read_csv('test_data.csv', header=0)
    train_X, train_y = get_data(train_df)
    test_X, test_y = get_data(test_df)

    # savefig does not create missing directories, so make sure the output
    # directory exists before the first figure is written.
    output_dir = 'result_decisionTree'
    os.makedirs(output_dir, exist_ok=True)

    # Fit one tree per maximum depth (1..10), save its decision regions and
    # report the accuracy on the training and the test data.
    # NOTE(review): no random_state is passed to the classifier, so results
    # may differ slightly between runs -- confirm whether that is intended.
    for depth in range(1, 11):
        tree = DecisionTreeClassifier(max_depth=depth).fit(train_X, train_y)
        fig, ax = plot_decision_regions(train_X, train_y, tree)
        ax.set_title('Tree Depth {}'.format(depth))
        fig.savefig('{}/dtree_train_depth{}.png'.format(output_dir, depth))
        # Close the figure so that figures do not accumulate across iterations.
        plt.close(fig)
        train_score = tree.score(train_X, train_y)
        test_score = tree.score(test_X, test_y)
        print('Depth: {:2d}, Accuracy(train, test): ({}, {})'.format(depth, train_score, test_score))

得られた結果のうち,木の深さが1と10のものを以下に示す.

f:id:mathematicsphysical:20190802003017p:plain
木の深さ: 1

f:id:mathematicsphysical:20190802003036p:plain
木の深さ: 10

過学習気味である.精度を確認する(精度だけでは不十分であるが).

木の深さ 精度(学習用データ) 精度(テスト用データ)
1 0.8206521739130435 0.8152173913043478
2 0.8804347826086957 0.8586956521739131
3 0.8858695652173914 0.8586956521739131
4 0.8913043478260869 0.8913043478260869
5 0.9239130434782609 0.8695652173913043
6 0.9347826086956522 0.8913043478260869
7 0.9619565217391305 0.8804347826086957
8 0.9728260869565217 0.8586956521739131
9 0.9782608695652174 0.8695652173913043
10 0.9836956521739131 0.8695652173913043

テスト用データに対する精度が最も高いのは,木の深さが4と6の時(約0.891)である.このうち,学習用データとテスト用データの精度に差がないのは深さ4の時であるため,深さ4が最も良いと判断する.

ランダムフォレストによる分類

続いて,ランダムフォレストによる分類を行う.使用するコードを以下に示す.

#!/usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

def get_data(target_df):
    """Split a labelled weather table into a feature matrix and a label vector.

    Args:
        target_df: DataFrame containing the columns ``Temperature``,
            ``Humidity`` and ``Status``.  Any other columns are ignored.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float64 array of shape
        ``(n_rows, 2)`` holding temperature (column 0) and humidity
        (column 1), and ``y`` is an int32 array of length ``n_rows`` holding
        the ``Status`` values.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Select the columns directly instead of going through Python lists and
    # a transpose; the row/column layout of the result is the same.
    X = target_df[['Temperature', 'Humidity']].to_numpy(dtype=np.float64)
    y = target_df['Status'].to_numpy(dtype=np.int32)
    return X, y

#
# 決定境界プロット関数
#
def plot_decision_regions(x, y, model, resolution=0.01):
    """Draw the decision regions of a fitted classifier over two features.

    A regular grid is laid over the feature plane, every grid point is
    classified with ``model.predict``, and the predictions are rendered as
    filled contours.  The samples are then scattered on top: rows labelled 0
    in blue, rows labelled 1 in red.

    Args:
        x: Array indexed as ``x[:, 0]`` (horizontal axis) and ``x[:, 1]``
            (vertical axis).
        y: Labels aligned with the rows of ``x``; only rows labelled 0 or 1
            are drawn as points.
        model: Object whose ``predict`` accepts an ``(n_points, 2)`` array
            and returns an array with one value per row.
        resolution: Grid spacing used on both axes.  The number of grid
            points grows with ``1 / resolution ** 2``, so small values are
            expensive in both time and memory.

    Returns:
        The ``(fig, ax)`` pair created for the plot.
    """
    # Accept plain sequences as well as arrays: the boolean masks used for
    # the scatter plots need element-wise comparison.
    x = np.asarray(x)
    y = np.asarray(y)

    # Grid spanning the data range, padded by 1 on every side.
    x1_min, x1_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    x2_min, x2_max = x[:, 1].min() - 1, x[:, 1].max() + 1
    x1_mesh, x2_mesh = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                                   np.arange(x2_min, x2_max, resolution))

    # Classify every grid point, then restore the grid shape for contouring.
    grid_points = np.column_stack((x1_mesh.ravel(), x2_mesh.ravel()))
    z = model.predict(grid_points).reshape(x1_mesh.shape)

    # Filled contours of the predicted class show the decision regions.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.contourf(x1_mesh, x2_mesh, z, alpha=0.3, cmap='jet')
    ax.set_xlim(x1_mesh.min(), x1_mesh.max())
    ax.set_ylim(x2_mesh.min(), x2_mesh.max())

    # Each class gets a fixed colour, so no colormap is passed: matplotlib
    # ignores ``cmap`` (and warns) when ``c`` is a colour name.
    ax.scatter(x[y == 0, 0], x[y == 0, 1], c='blue')
    ax.scatter(x[y == 1, 0], x[y == 1, 1], c='red')
    ax.set_xlabel('Temperature')
    ax.set_ylabel('Humidity')
    return fig, ax

if __name__ == '__main__':
    import os

    # Load the prepared data sets; the first row of each CSV is the header.
    train_df = pd.read_csv('train_data.csv', header=0)
    test_df = pd.read_csv('test_data.csv', header=0)
    train_X, train_y = get_data(train_df)
    test_X, test_y = get_data(test_df)

    # savefig does not create missing directories, so make sure the output
    # directory exists before the first figure is written.
    output_dir = 'result_randomForest'
    os.makedirs(output_dir, exist_ok=True)

    # Fit one forest per ensemble size, save its decision regions and report
    # the accuracy on the training and the test data.
    # NOTE(review): no random_state is passed to the classifier, so the
    # fitted forests -- and therefore the scores -- can differ between runs.
    for est_num in [3, 5, 10, 15, 20]:
        forest = RandomForestClassifier(n_estimators=est_num).fit(train_X, train_y)
        fig, ax = plot_decision_regions(train_X, train_y, forest)
        ax.set_title('Tree Estimators {}'.format(est_num))
        fig.savefig('{}/rforest_train_est{}.png'.format(output_dir, est_num))
        # Close the figure so that figures do not accumulate across iterations.
        plt.close(fig)
        train_score = forest.score(train_X, train_y)
        test_score = forest.score(test_X, test_y)
        print('Estimators: {:2d}, Accuracy(train, test): ({}, {})'.format(est_num, train_score, test_score))

得られた結果のうち,決定木の数が10と15のものを以下に示す.

f:id:mathematicsphysical:20190802004310p:plain
決定木の数: 10

f:id:mathematicsphysical:20190802004329p:plain
決定木の数: 15

こちらも少し過学習気味である.精度を確認する.

決定木の数 精度(学習用データ) 精度(テスト用データ)
3 0.9456521739130435 0.8913043478260869
5 0.9510869565217391 0.8695652173913043
10 0.9728260869565217 0.8804347826086957
15 0.967391304347826 0.8913043478260869
20 0.9836956521739131 0.8586956521739131

テスト用データに対する精度は,決定木の数が3と15の時が最も良い(約0.891)が,これは,決定木単体で推定した場合の最良の精度と変わらない.ランダムフォレストの利点が見いだせない.学習用データとテスト用データの構成が似ているから,これが限界かもしれない.

分類問題を解く ~データの用意と可視化~

どうも,筆者です.

久しぶりの更新である.今回は,気象庁が公開している過去の気象データを用いて,分類問題を解く.まず,分類用データがないため,データの用意をする.

とある都市の気象データを以下に示す.

気温(℃),湿度(%),日照時間(h)
平均,平均,平均
20.9,47,9
20.5,40,14
20.7,47,13
21.5,65,3.3
19.3,75,4.2
21.6,68,8.3
21,74,0
22.6,70,10.5
22.8,79,1.4
24.5,62,13.4
24.3,62,10.8
23.2,75,0.2
21.7,87,0
24,70,7.1
24.2,70,3.7
21.7,84,0
24.1,70,3.8
25.9,54,13.4
22.7,81,0.2
25.1,71,5.3
25.2,74,6.3
24.4,83,0.6
24.8,76,7.6
22.5,79,0.4
23,86,0
23.3,66,8.6
24.2,60,8
20.7,83,0.2
22.3,89,0
25,81,1
27.7,70,11.2
29,67,9.3
27.8,75,3.1
27.9,73,7.3
27.9,68,9.5
25.9,77,3
29.9,63,11.6
27.1,74,0
22.3,94,0
26.7,74,9.4
29.3,51,12.4
28.2,64,0.6
27.1,83,0.5
28.1,69,9
25.5,66,5.5
25.6,72,0.9
27.1,75,0.5
28.2,60,13.5
28.7,53,13.8
28.5,58,13.5
27.7,65,7
26.3,71,3.5
25.1,68,2.1
25.4,70,3.7
26.3,64,4.8
22.7,87,0
24.9,85,1.7
26.8,74,3.2
27.2,69,6.8
28,63,5.8
29.1,61,9.8
27.9,74,8.8
26.5,79,6.4
27.9,73,7.5
29,70,10.1
29.3,69,11.2
29.8,67,10.1
29.5,67,8.7
31,56,12.8
30.1,55,11.9
28.9,51,11
28.5,62,11.7
28.3,66,6.2
28.4,63,5.6
28,73,3.5
27.9,74,2.8
29.8,61,10.2
30.9,62,7.5
29.3,73,5.6
29.6,69,10.5
29.6,68,8.5
29.1,68,9.3
30.9,62,5
30.4,61,7.7
27.8,76,2.3
28.1,71,4.2
28.9,65,9.2
26.6,73,1.4
25.3,84,0.8
26.9,75,6.1
24.3,67,7
26.8,55,12.1
23.8,64,5.3
19.9,57,8.9
19.3,47,12.8
20.2,43,13.8
20.9,41,14
21.1,58,8.9
19.4,78,0
19.3,82,2.6
22.6,57,10.7
23.1,57,8.6
21.7,39,6.2
20.7,44,12.7
20.7,47,13.6
21.9,47,14
23.3,46,13.8
22.8,48,13
22.3,55,8.9
22,65,0
25.6,48,13.7
25,57,7
21,86,0.2
23.5,59,4
24.7,63,12.6
24.4,70,5.4
21.9,90,0
25,60,6.8
24.6,72,1.5
24,81,3.2
24.8,76,0.7
22.9,90,0.5
26.6,79,2.7
27.8,74,5.5
29.1,67,7.2
26.9,78,0.8
26,73,1.7
26.9,61,12.7
28.4,55,13.6
28.9,56,11
27.6,73,0.1
27.5,77,3.6
28.2,73,6.7
28.6,76,3.5
27.9,79,5.6
27.9,74,6.3
28.8,66,9.2
29.7,63,8.6
27.8,72,2.6
29.3,61,10.8
28.2,67,8.8
28.5,69,9.1
28.9,67,7.5
29,68,8
27,81,0.4
28.6,75,3.4
27,89,0.1
29.3,78,4.9
26.3,82,0
28.5,73,5.5
29,72,4.3
27,81,0
29.2,72,8.7
29.8,65,7
27.8,66,3.3
27.7,69,3.3
29.3,69,6.1
29.2,75,3.1
29.9,70,10.2
27.3,83,0.1
27.4,73,6.6
29.7,57,12.5
29.5,67,3.5
27.1,89,0.9
27.9,74,7.3
28,62,4.8
27.5,66,0.6
24.9,85,0
25.9,85,1.2
28,74,4.6
26.7,85,3.7
28.2,70,9.8
28.1,73,6
28.5,72,6.3
28,74,2.3
27.8,77,5.5
30.8,62,10.9
30.2,65,8
27.5,58,2.8
26.8,62,10
27.3,72,1
28.7,74,3.5
28.9,64,9.2
27.3,51,11.5

このデータを下記の条件に合致する場合は1,合致しない場合は0とラベルを振る. 【条件】「気温が24℃以上」かつ「湿度が50%以上」かつ「日照時間が3h以上」

また,分類の入力(特徴量)として利用するデータは「気温」と「湿度」のみとする(日照時間はラベル付けにのみ用いる).気温,湿度,上記の条件を適用した結果を以下に示す.

Temperature,Humidity,Status
20.9,47,0
20.5,40,0
20.7,47,0
21.5,65,0
19.3,75,0
21.6,68,0
21,74,0
22.6,70,0
22.8,79,0
24.5,62,1
24.3,62,1
23.2,75,0
21.7,87,0
24,70,1
24.2,70,1
21.7,84,0
24.1,70,1
25.9,54,1
22.7,81,0
25.1,71,1
25.2,74,1
24.4,83,0
24.8,76,1
22.5,79,0
23,86,0
23.3,66,0
24.2,60,1
20.7,83,0
22.3,89,0
25,81,0
27.7,70,1
29,67,1
27.8,75,1
27.9,73,1
27.9,68,1
25.9,77,1
29.9,63,1
27.1,74,0
22.3,94,0
26.7,74,1
29.3,51,1
28.2,64,0
27.1,83,0
28.1,69,1
25.5,66,1
25.6,72,0
27.1,75,0
28.2,60,1
28.7,53,1
28.5,58,1
27.7,65,1
26.3,71,1
25.1,68,0
25.4,70,1
26.3,64,1
22.7,87,0
24.9,85,0
26.8,74,1
27.2,69,1
28,63,1
29.1,61,1
27.9,74,1
26.5,79,1
27.9,73,1
29,70,1
29.3,69,1
29.8,67,1
29.5,67,1
31,56,1
30.1,55,1
28.9,51,1
28.5,62,1
28.3,66,1
28.4,63,1
28,73,1
27.9,74,0
29.8,61,1
30.9,62,1
29.3,73,1
29.6,69,1
29.6,68,1
29.1,68,1
30.9,62,1
30.4,61,1
27.8,76,0
28.1,71,1
28.9,65,1
26.6,73,0
25.3,84,0
26.9,75,1
24.3,67,1
26.8,55,1
23.8,64,0
19.9,57,0
19.3,47,0
20.2,43,0
20.9,41,0
21.1,58,0
19.4,78,0
19.3,82,0
22.6,57,0
23.1,57,0
21.7,39,0
20.7,44,0
20.7,47,0
21.9,47,0
23.3,46,0
22.8,48,0
22.3,55,0
22,65,0
25.6,48,0
25,57,1
21,86,0
23.5,59,0
24.7,63,1
24.4,70,1
21.9,90,0
25,60,1
24.6,72,0
24,81,1
24.8,76,0
22.9,90,0
26.6,79,0
27.8,74,1
29.1,67,1
26.9,78,0
26,73,0
26.9,61,1
28.4,55,1
28.9,56,1
27.6,73,0
27.5,77,1
28.2,73,1
28.6,76,1
27.9,79,1
27.9,74,1
28.8,66,1
29.7,63,1
27.8,72,0
29.3,61,1
28.2,67,1
28.5,69,1
28.9,67,1
29,68,1
27,81,0
28.6,75,1
27,89,0
29.3,78,1
26.3,82,0
28.5,73,1
29,72,1
27,81,0
29.2,72,1
29.8,65,1
27.8,66,1
27.7,69,1
29.3,69,1
29.2,75,1
29.9,70,1
27.3,83,0
27.4,73,1
29.7,57,1
29.5,67,1
27.1,89,0
27.9,74,1
28,62,1
27.5,66,0
24.9,85,0
25.9,85,0
28,74,1
26.7,85,1
28.2,70,1
28.1,73,1
28.5,72,1
28,74,0
27.8,77,1
30.8,62,1
30.2,65,1
27.5,58,0
26.8,62,1
27.3,72,0
28.7,74,1
28.9,64,1
27.3,51,1

これを「train_data.csv」として保存する.また,テスト用のデータも同様に用意する.下記にとある時期の気象データを示す(列は先ほどと同じく,気温,湿度,日照時間(h)の順である).

21.3,62,13.4
22.4,61,13.1
22.9,62,10.6
23.9,60,12.9
23.6,64,7
19.4,93,0
22.6,81,5.5
22.6,82,1.4
24.1,69,10.8
21.3,78,0
24.1,76,0.3
23.7,63,11.5
22.7,56,13.8
22.6,62,8.7
21.6,77,0
21.2,60,9.5
21.8,67,4
21,77,4.6
23.1,73,5.4
20.9,96,0
23.3,71,2.5
24.5,51,12.3
20.3,79,0.1
24.3,62,8.5
27.4,52,13
26.6,69,9.2
27.1,72,1.6
26.8,75,2.8
26.4,78,2.3
27.3,75,5.2
27.9,68,10.5
28,66,7.4
27.2,68,5.8
25.7,84,0
24.6,93,0
24.3,93,0
25.5,84,0.2
26.9,73,6
27.7,68,8.8
28.5,65,11
29.7,59,7.1
29.4,58,3.3
29.7,58,8.7
30.8,54,13
31.2,57,12.8
31.5,60,12.2
31,64,10.1
32,61,10.6
31,65,8
30.8,63,11.1
30.5,63,10.6
32.7,51,13.5
33.3,46,13.1
32.2,52,12.6
30.8,63,6.7
29.3,62,5.3
28.4,58,5
29.2,59,3.7
28.4,68,5.8
29.6,58,11.6
29.9,61,7.2
31,58,11.5
33.1,51,12.5
32.8,45,12.9
31.7,52,10.7
32.7,53,9.8
33,49,12.9
29.6,58,3
30.2,52,10.9
31.5,54,11.7
30.8,61,3.6
31.3,61,7.5
28.2,74,3
29.6,70,6.5
30.2,65,8.1
26.9,79,0.4
27.9,74,2.4
26.5,38,13.1
25,50,9.2
25.5,52,10
26.2,65,2.3
28.9,70,6.8
30.4,60,10.2
28.8,67,6.6
28,79,0.2
29.8,70,5.5
31.1,57,12.3
31.1,52,11.8
29.5,58,2.9
30.1,59,9.5
29.7,61,7.9
28.6,69,4

これを下記の条件に従いラベルを振る.合致する場合は1,合致しない場合は0とする. 【条件】「気温が24℃以上」かつ「湿度が50%以上」かつ「日照時間が3h以上」

また,ここで利用するデータは「気温」と「湿度」のみとする.気温,湿度,上記の条件を適用した結果を以下に示す.

Temperature,Humidity,Status
21.3,62,0
22.4,61,0
22.9,62,0
23.9,60,0
23.6,64,0
19.4,93,0
22.6,81,0
22.6,82,0
24.1,69,1
21.3,78,0
24.1,76,0
23.7,63,0
22.7,56,0
22.6,62,0
21.6,77,0
21.2,60,0
21.8,67,0
21,77,0
23.1,73,0
20.9,96,0
23.3,71,0
24.5,51,1
20.3,79,0
24.3,62,1
27.4,52,1
26.6,69,1
27.1,72,0
26.8,75,0
26.4,78,0
27.3,75,1
27.9,68,1
28,66,1
27.2,68,1
25.7,84,0
24.6,93,0
24.3,93,0
25.5,84,0
26.9,73,1
27.7,68,1
28.5,65,1
29.7,59,1
29.4,58,1
29.7,58,1
30.8,54,1
31.2,57,1
31.5,60,1
31,64,1
32,61,1
31,65,1
30.8,63,1
30.5,63,1
32.7,51,1
33.3,46,0
32.2,52,1
30.8,63,1
29.3,62,1
28.4,58,1
29.2,59,1
28.4,68,1
29.6,58,1
29.9,61,1
31,58,1
33.1,51,1
32.8,45,0
31.7,52,1
32.7,53,1
33,49,0
29.6,58,1
30.2,52,1
31.5,54,1
30.8,61,1
31.3,61,1
28.2,74,1
29.6,70,1
30.2,65,1
26.9,79,0
27.9,74,0
26.5,38,0
25,50,1
25.5,52,1
26.2,65,0
28.9,70,1
30.4,60,1
28.8,67,1
28,79,0
29.8,70,1
31.1,57,1
31.1,52,1
29.5,58,0
30.1,59,1
29.7,61,1
28.6,69,1

これを「test_data.csv」として保存する.

下記のスクリプトでデータを可視化する.

#!/usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def get_data(target_df):
    """Split a labelled weather table into a feature matrix and a label vector.

    Args:
        target_df: DataFrame containing the columns ``Temperature``,
            ``Humidity`` and ``Status``.  Any other columns are ignored.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float64 array of shape
        ``(n_rows, 2)`` holding temperature (column 0) and humidity
        (column 1), and ``y`` is an int32 array of length ``n_rows`` holding
        the ``Status`` values.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Select the columns directly instead of going through Python lists and
    # a transpose; the row/column layout of the result is the same.
    X = target_df[['Temperature', 'Humidity']].to_numpy(dtype=np.float64)
    y = target_df['Status'].to_numpy(dtype=np.int32)
    return X, y

def visualization(X, y, marker='o'):
    """Scatter-plot a two-feature data set coloured by its binary label.

    Rows labelled 0 are drawn in blue and rows labelled 1 in red.

    Args:
        X: Array indexed as ``X[:, 0]`` (horizontal axis) and ``X[:, 1]``
            (vertical axis).
        y: Labels aligned with the rows of ``X``; only rows labelled 0 or 1
            are drawn.
        marker: Marker style passed to both scatter calls.

    Returns:
        The ``(fig, ax)`` pair created for the plot.
    """
    # Accept plain sequences as well as arrays: the boolean masks below need
    # element-wise comparison.
    X = np.asarray(X)
    y = np.asarray(y)

    fig, ax = plt.subplots(figsize=(10, 10))

    # Axis limits are the first and last value of a 0.1-step walk over the
    # data range padded by 1.  One-dimensional ranges are enough for that;
    # no 2-D mesh is required.
    x1_steps = np.arange(X[:, 0].min() - 1, X[:, 0].max() + 1, 0.1)
    x2_steps = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 0.1)

    # Each class gets a fixed colour, so no colormap is passed: matplotlib
    # ignores ``cmap`` (and warns) when ``c`` is a colour name.
    ax.scatter(X[y == 0, 0], X[y == 0, 1], c='blue', marker=marker)
    ax.scatter(X[y == 1, 0], X[y == 1, 1], c='red', marker=marker)
    ax.set_xlim(x1_steps.min(), x1_steps.max())
    ax.set_ylim(x2_steps.min(), x2_steps.max())
    ax.set_xlabel('Temperature')
    ax.set_ylabel('Humidity')
    return fig, ax

if __name__ == '__main__':
    # Read both data sets (the first CSV row is the header) and split each
    # one into features and labels.
    train_df = pd.read_csv('train_data.csv', header=0)
    test_df = pd.read_csv('test_data.csv', header=0)
    train_X, train_y = get_data(train_df)
    test_X, test_y = get_data(test_df)

    # One figure per data set: (features, labels, name used in the title and
    # the file name, marker style).
    plot_jobs = [
        (train_X, train_y, 'train', 'o'),
        (test_X, test_y, 'test', '^'),
    ]
    for features, labels, dataset_name, point_marker in plot_jobs:
        fig, ax = visualization(features, labels, marker=point_marker)
        title = '{} data'.format(dataset_name)
        ax.set_title(title)
        out_path = 'visualization_{}_data.png'.format(dataset_name)
        plt.savefig(out_path)
        # Close the figure once it has been written to disk.
        plt.close(fig)

結果は下記のようになる.

f:id:mathematicsphysical:20190802001956p:plain
学習用データ

f:id:mathematicsphysical:20190802002019p:plain
テスト用データ

今回はここまでとする.以降,時間を見つけて分類問題を解く.