import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

먼저 bike 데이터셋을 가져와 데이터프레임으로 저장한다.

bike_df = pd.read_csv('/content/drive/MyDrive/KDT/4. 머신러닝과 딥러닝/bike.csv')

bike_df

	datetime	count	holiday	workingday	temp	feels_like	temp_min	temp_max	pressure	humidity	wind_speed	wind_deg	rain_1h	snow_1h	clouds_all	weather_main
0	2018-01-01 0:00	34	1	0	-7.17	-12.73	-8.56	-7.09	1030	53	3.60	310	NaN	NaN	20	Clouds
1	2018-01-01 1:00	49	1	0	-7.35	-13.81	-9.03	-7.15	1030	49	4.60	310	NaN	NaN	1	Clear
2	2018-01-01 2:00	37	1	0	-7.88	-14.05	-9.03	-7.69	1031	52	4.10	310	NaN	NaN	1	Clear
3	2018-01-01 3:00	9	1	0	-8.10	-14.32	-9.36	-7.89	1031	49	4.10	310	NaN	NaN	1	Clear
4	2018-01-01 4:00	12	1	0	-8.19	-14.43	-9.46	-8.09	1031	49	4.10	330	NaN	NaN	1	Clear
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
33374	2021-08-31 19:00	659	0	1	28.78	32.79	26.78	29.94	1007	73	0.45	339	1.00	NaN	90	Rain
33375	2021-08-31 20:00	404	0	1	28.52	32.37	26.34	29.84	1007	74	0.45	347	0.25	NaN	90	Rain
33376	2021-08-31 21:00	259	0	1	28.22	31.85	26.78	29.25	1007	75	0.45	327	NaN	NaN	90	Clouds
33377	2021-08-31 22:00	192	0	1	27.51	30.42	26.43	28.85	1004	76	2.06	60	NaN	NaN	90	Clouds
33378	2021-08-31 23:00	139	0	1	24.48	24.73	23.06	27.85	1009	67	1.54	210	1.00	NaN	90	Rain

33379 rows × 16 columns

bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33379 entries, 0 to 33378
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   datetime      33379 non-null  object 
 1   count         33379 non-null  int64  
 2   holiday       33379 non-null  int64  
 3   workingday    33379 non-null  int64  
 4   temp          33379 non-null  float64
 5   feels_like    33379 non-null  float64
 6   temp_min      33379 non-null  float64
 7   temp_max      33379 non-null  float64
 8   pressure      33379 non-null  int64  
 9   humidity      33379 non-null  int64  
 10  wind_speed    33379 non-null  float64
 11  wind_deg      33379 non-null  int64  
 12  rain_1h       6771 non-null   float64
 13  snow_1h       326 non-null    float64
 14  clouds_all    33379 non-null  int64  
 15  weather_main  33379 non-null  object 
dtypes: float64(7), int64(7), object(2)
memory usage: 4.1+ MB

datetime: 날짜
count: 대여 개수
holiday: 휴일
workingday: 근무일
temp: 기온
feels_like: 체감온도
temp_min: 최저온도
temp_max: 최고온도
pressure: 기압
humidity: 습도
wind_speed: 풍속
wind_deg: 풍향
rain_1h: 1시간당 내리는 비의 양
snow_1h: 1시간당 내리는 눈의 양
clouds_all: 구름의 양
weather_main: 날씨

데이터셋에 이상치가 존재하는지 간단하게 확인해본다.

bike_df.describe()

	count	holiday	workingday	temp	feels_like	temp_min	temp_max	pressure	humidity	wind_speed	wind_deg	rain_1h	snow_1h	clouds_all
count	33379.000000	33379.000000	33379.000000	33379.000000	33379.000000	33379.000000	33379.000000	33379.000000	33379.000000	33379.000000	33379.000000	6771.000000	326.000000	33379.000000
mean	333.139788	0.030618	0.681327	15.213087	14.994843	13.532648	16.105542	1017.071602	67.818628	1.829340	174.022919	1.216475	0.641380	63.213997
std	336.519514	0.172283	0.465969	9.908964	11.176487	9.993094	9.984839	7.379420	18.422105	1.703747	113.844334	2.056222	0.571087	30.825936
min	0.000000	0.000000	0.000000	-12.790000	-18.910000	-15.140000	-12.290000	980.000000	14.000000	0.000000	0.000000	0.100000	0.100000	0.000000
25%	59.000000	0.000000	0.000000	6.860000	5.880000	5.230000	7.730000	1012.000000	53.000000	0.450000	62.000000	0.250000	0.250000	40.000000
50%	236.000000	0.000000	1.000000	15.650000	15.020000	13.910000	16.590000	1017.000000	70.000000	1.340000	180.000000	0.530000	0.420000	75.000000
75%	495.000000	0.000000	1.000000	23.800000	24.140000	21.970000	24.390000	1022.000000	84.000000	2.600000	285.000000	1.300000	1.000000	90.000000
max	2038.000000	1.000000	1.000000	36.710000	43.710000	35.380000	38.810000	1044.000000	100.000000	16.980000	360.000000	54.050000	3.300000	100.000000

이상치로 의심되는 count컬럼부터 그래프들로 확인한다.

sns.displot(bike_df['count'])

그래프상 이상치정도의 데이터는 아니므로 그대로 사용한다.

sns.boxplot(bike_df['count'])

sns.scatterplot(x='feels_like', y='count', data=bike_df, alpha=0.3)

sns.scatterplot(x='pressure', y='count', data=bike_df, alpha=0.3)

sns.scatterplot(x='wind_speed', y='count', data=bike_df, alpha=0.3)

In [ ]:

sns.scatterplot(x='wind_deg', y='count', data=bike_df, alpha=0.3)

In [ ]:

bike_df.isna().sum()

Out[ ]:

datetime            0
count               0
holiday             0
workingday          0
temp                0
feels_like          0
temp_min            0
temp_max            0
pressure            0
humidity            0
wind_speed          0
wind_deg            0
rain_1h         26608
snow_1h         33053
clouds_all          0
weather_main        0
dtype: int64

In [ ]:

bike_df = bike_df.fillna(0)

In [ ]:

bike_df.isna().mean()

Out[ ]:

datetime        0.0
count           0.0
holiday         0.0
workingday      0.0
temp            0.0
feels_like      0.0
temp_min        0.0
temp_max        0.0
pressure        0.0
humidity        0.0
wind_speed      0.0
wind_deg        0.0
rain_1h         0.0
snow_1h         0.0
clouds_all      0.0
weather_main    0.0
dtype: float64

In [ ]:

bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33379 entries, 0 to 33378
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   datetime      33379 non-null  object 
 1   count         33379 non-null  int64  
 2   holiday       33379 non-null  int64  
 3   workingday    33379 non-null  int64  
 4   temp          33379 non-null  float64
 5   feels_like    33379 non-null  float64
 6   temp_min      33379 non-null  float64
 7   temp_max      33379 non-null  float64
 8   pressure      33379 non-null  int64  
 9   humidity      33379 non-null  int64  
 10  wind_speed    33379 non-null  float64
 11  wind_deg      33379 non-null  int64  
 12  rain_1h       33379 non-null  float64
 13  snow_1h       33379 non-null  float64
 14  clouds_all    33379 non-null  int64  
 15  weather_main  33379 non-null  object 
dtypes: float64(7), int64(7), object(2)
memory usage: 4.1+ MB

In [ ]:

bike_df['datetime'] = pd.to_datetime(bike_df['datetime'])

In [ ]:

bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33379 entries, 0 to 33378
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   datetime      33379 non-null  datetime64[ns]
 1   count         33379 non-null  int64         
 2   holiday       33379 non-null  int64         
 3   workingday    33379 non-null  int64         
 4   temp          33379 non-null  float64       
 5   feels_like    33379 non-null  float64       
 6   temp_min      33379 non-null  float64       
 7   temp_max      33379 non-null  float64       
 8   pressure      33379 non-null  int64         
 9   humidity      33379 non-null  int64         
 10  wind_speed    33379 non-null  float64       
 11  wind_deg      33379 non-null  int64         
 12  rain_1h       33379 non-null  float64       
 13  snow_1h       33379 non-null  float64       
 14  clouds_all    33379 non-null  int64         
 15  weather_main  33379 non-null  object        
dtypes: datetime64[ns](1), float64(7), int64(7), object(1)
memory usage: 4.1+ MB

In [ ]:

bike_df.head()

Out[ ]:

	datetime	count	holiday	temp	feels_like	temp_min	temp_max	pressure	humidity	wind_speed	wind_deg	clouds_all	weather_main
0	2018-01-01 00:00:00	34	1	-7.17	-12.73	-8.56	-7.09	1030	53	3.6	310	20	Clouds
1	2018-01-01 01:00:00	49	1	-7.35	-13.81	-9.03	-7.15	1030	49	4.6	310	1	Clear
2	2018-01-01 02:00:00	37	1	-7.88	-14.05	-9.03	-7.69	1031	52	4.1	310	1	Clear
3	2018-01-01 03:00:00	9	1	-8.10	-14.32	-9.36	-7.89	1031	49	4.1	310	1	Clear
4	2018-01-01 04:00:00	12	1	-8.19	-14.43	-9.46	-8.09	1031	49	4.1	330	1	Clear

In [ ]:

bike_df['year'] = bike_df['datetime'].dt.year
bike_df['month'] = bike_df['datetime'].dt.month
bike_df['hour'] = bike_df['datetime'].dt.hour

In [ ]:

bike_df.head()

Out[ ]:

	datetime	count	holiday	temp	feels_like	temp_min	temp_max	pressure	humidity	wind_speed	wind_deg	clouds_all	weather_main	year	month	hour
0	2018-01-01 00:00:00	34	1	-7.17	-12.73	-8.56	-7.09	1030	53	3.6	310	20	Clouds	2018	1	0
1	2018-01-01 01:00:00	49	1	-7.35	-13.81	-9.03	-7.15	1030	49	4.6	310	1	Clear	2018	1	1
2	2018-01-01 02:00:00	37	1	-7.88	-14.05	-9.03	-7.69	1031	52	4.1	310	1	Clear	2018	1	2
3	2018-01-01 03:00:00	9	1	-8.10	-14.32	-9.36	-7.89	1031	49	4.1	310	1	Clear	2018	1	3
4	2018-01-01 04:00:00	12	1	-8.19	-14.43	-9.46	-8.09	1031	49	4.1	330	1	Clear	2018	1	4

In [ ]:

bike_df['date'] = bike_df['datetime'].dt.date

In [ ]:

bike_df.head()

Out[ ]:

	datetime	count	holiday	temp	feels_like	temp_min	temp_max	pressure	humidity	wind_speed	wind_deg	clouds_all	weather_main	year	month	hour	date
0	2018-01-01 00:00:00	34	1	-7.17	-12.73	-8.56	-7.09	1030	53	3.6	310	20	Clouds	2018	1	0	2018-01-01
1	2018-01-01 01:00:00	49	1	-7.35	-13.81	-9.03	-7.15	1030	49	4.6	310	1	Clear	2018	1	1	2018-01-01
2	2018-01-01 02:00:00	37	1	-7.88	-14.05	-9.03	-7.69	1031	52	4.1	310	1	Clear	2018	1	2	2018-01-01
3	2018-01-01 03:00:00	9	1	-8.10	-14.32	-9.36	-7.89	1031	49	4.1	310	1	Clear	2018	1	3	2018-01-01
4	2018-01-01 04:00:00	12	1	-8.19	-14.43	-9.46	-8.09	1031	49	4.1	330	1	Clear	2018	1	4	2018-01-01

In [ ]:

plt.figure(figsize=(14, 4))
sns.lineplot(x='date', y='count', data=bike_df)
plt.xticks(rotation=45)
plt.show()

In [ ]:

bike_df[bike_df['year'] == 2019].groupby('month')['count'].mean()

Out[ ]:

month
1     193.368862
2     221.857718
3     326.564456
4     482.931694
5     438.027848
6     478.480053
7     472.745785
8     481.267366
9     500.862069
10    446.279070
11    307.295393
12    213.148886
Name: count, dtype: float64

In [ ]:

bike_df[bike_df['year'] == 2020].groupby('month')['count'].mean() # 2020년 4월 데이터가 없음

Out[ ]:

month
1     260.445997
2     255.894320
3     217.135241
5     196.581064
6     290.900937
7     299.811688
8     331.528809
9     338.876478
10    293.640777
11    240.507324
12    138.993540
Name: count, dtype: float64

In [ ]:

# covid
# 2020-04-01 이전: precovid
# 2021-04-01 이전: covid
# 이후: postcovid

def covid(date):
    if str(date) < '2020-04-01':
        return 'precovid'
    elif str(date) < '2021-04-01':
        return 'covid'
    else:
        return 'postcovid'

In [ ]:

bike_df['date'] > '2020-04-01'

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-32-04fa4b9c92bf> in <cell line: 1>()
----> 1 bike_df['date'] > '2020-04-01'

/usr/local/lib/python3.10/dist-packages/pandas/core/ops/common.py in new_method(self, other)
     70         other = item_from_zerodim(other)
     71 
---> 72         return method(self, other)
     73 
     74     return new_method

/usr/local/lib/python3.10/dist-packages/pandas/core/arraylike.py in __gt__(self, other)
     56     @unpack_zerodim_and_defer("__gt__")
     57     def __gt__(self, other):
---> 58         return self._cmp_method(other, operator.gt)
     59 
     60     @unpack_zerodim_and_defer("__ge__")

/usr/local/lib/python3.10/dist-packages/pandas/core/series.py in _cmp_method(self, other, op)
   6241 
   6242         with np.errstate(all="ignore"):
-> 6243             res_values = ops.comparison_op(lvalues, rvalues, op)
   6244 
   6245         return self._construct_result(res_values, name=res_name)

/usr/local/lib/python3.10/dist-packages/pandas/core/ops/array_ops.py in comparison_op(left, right, op)
    285 
    286     elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str):
--> 287         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
    288 
    289     else:

/usr/local/lib/python3.10/dist-packages/pandas/core/ops/array_ops.py in comp_method_OBJECT_ARRAY(op, x, y)
     73         result = libops.vec_compare(x.ravel(), y.ravel(), op)
     74     else:
---> 75         result = libops.scalar_compare(x.ravel(), y, op)
     76     return result.reshape(x.shape)
     77 

/usr/local/lib/python3.10/dist-packages/pandas/_libs/ops.pyx in pandas._libs.ops.scalar_compare()

TypeError: '>' not supported between instances of 'datetime.date' and 'str'

In [ ]:

covid(bike_df['date'])

Out[ ]:

'precovid'

In [ ]:

bike_df['date'].apply(covid)

Out[ ]:

0         precovid
1         precovid
2         precovid
3         precovid
4         precovid
           ...    
33374    postcovid
33375    postcovid
33376    postcovid
33377    postcovid
33378    postcovid
Name: date, Length: 33379, dtype: object

In [ ]:

bike_df['covid'] = bike_df['date'].apply(lambda date: 'precovid' if str(date) < '2020-04-01' else 'covid' if str(date) < '2021-04-01' else 'postcovid')

In [ ]:

bike_df.head()

Out[ ]:

	datetime	count	holiday	temp	feels_like	temp_min	temp_max	pressure	humidity	...	wind_deg	clouds_all	weather_main	year	month	hour	date	covid
0	2018-01-01 00:00:00	34	1	-7.17	-12.73	-8.56	-7.09	1030	53	...	310	20	Clouds	2018	1	0	2018-01-01	precovid
1	2018-01-01 01:00:00	49	1	-7.35	-13.81	-9.03	-7.15	1030	49	...	310	1	Clear	2018	1	1	2018-01-01	precovid
2	2018-01-01 02:00:00	37	1	-7.88	-14.05	-9.03	-7.69	1031	52	...	310	1	Clear	2018	1	2	2018-01-01	precovid
3	2018-01-01 03:00:00	9	1	-8.10	-14.32	-9.36	-7.89	1031	49	...	310	1	Clear	2018	1	3	2018-01-01	precovid
4	2018-01-01 04:00:00	12	1	-8.19	-14.43	-9.46	-8.09	1031	49	...	330	1	Clear	2018	1	4	2018-01-01	precovid

5 rows × 21 columns

In [ ]:

# season
# 3월 ~ 5월: spring
# 6월 ~ 8월: summer
# 9월 ~ 11월: fall
# 12월 ~ 2월: winter
bike_df['season'] = bike_df['month'].apply(lambda x: 'winter' if x == 12 else 'fall' if x >= 9 else 'summer' if x >= 6 else 'spring' if x >= 3 else 'winter')
# bike_df['season'] = bike_df['month'].apply(lambda month: 'spring' if month >= 3 and month <= 5 else 'summer' if month >= 6 and month <= 8 else 'fall' if month >= 9 and month <= 11 else 'winter')

In [ ]:

bike_df[['month', 'season']]

Out[ ]:

	month	season
0	1	winter
1	1	winter
2	1	winter
3	1	winter
4	1	winter
...	...	...
33374	8	summer
33375	8	summer
33376	8	summer
33377	8	summer
33378	8	summer

33379 rows × 2 columns

In [ ]:

bike_df['day_night'] = bike_df['hour'].apply(lambda x: 'night' if x >= 21 else 'late evening' if x >= 19 else 'early evening' if x >= 17 else 'late afternoon' if x >= 16 else 'early afternoon' if x >= 13 else 'late morning' if x >= 11 else 'early morning' if x >= 5 else 'night')

In [ ]:

bike_df.head()

Out[ ]:

	datetime	count	holiday	temp	feels_like	temp_min	temp_max	pressure	humidity	...	clouds_all	weather_main	year	month	hour	date	covid	season	day_night
0	2018-01-01 00:00:00	34	1	-7.17	-12.73	-8.56	-7.09	1030	53	...	20	Clouds	2018	1	0	2018-01-01	precovid	winter	night
1	2018-01-01 01:00:00	49	1	-7.35	-13.81	-9.03	-7.15	1030	49	...	1	Clear	2018	1	1	2018-01-01	precovid	winter	night
2	2018-01-01 02:00:00	37	1	-7.88	-14.05	-9.03	-7.69	1031	52	...	1	Clear	2018	1	2	2018-01-01	precovid	winter	night
3	2018-01-01 03:00:00	9	1	-8.10	-14.32	-9.36	-7.89	1031	49	...	1	Clear	2018	1	3	2018-01-01	precovid	winter	night
4	2018-01-01 04:00:00	12	1	-8.19	-14.43	-9.46	-8.09	1031	49	...	1	Clear	2018	1	4	2018-01-01	precovid	winter	night

5 rows × 23 columns

In [ ]:

bike_df.drop(['datetime', 'month', 'hour', 'date'], axis=1, inplace=True)

In [ ]:

bike_df.head()

Out[ ]:

	count	holiday	temp	feels_like	temp_min	temp_max	pressure	humidity	wind_speed	wind_deg	clouds_all	weather_main	year	covid	season	day_night
0	34	1	-7.17	-12.73	-8.56	-7.09	1030	53	3.6	310	20	Clouds	2018	precovid	winter	night
1	49	1	-7.35	-13.81	-9.03	-7.15	1030	49	4.6	310	1	Clear	2018	precovid	winter	night
2	37	1	-7.88	-14.05	-9.03	-7.69	1031	52	4.1	310	1	Clear	2018	precovid	winter	night
3	9	1	-8.10	-14.32	-9.36	-7.89	1031	49	4.1	310	1	Clear	2018	precovid	winter	night
4	12	1	-8.19	-14.43	-9.46	-8.09	1031	49	4.1	330	1	Clear	2018	precovid	winter	night

In [ ]:

bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33379 entries, 0 to 33378
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   count         33379 non-null  int64  
 1   holiday       33379 non-null  int64  
 2   workingday    33379 non-null  int64  
 3   temp          33379 non-null  float64
 4   feels_like    33379 non-null  float64
 5   temp_min      33379 non-null  float64
 6   temp_max      33379 non-null  float64
 7   pressure      33379 non-null  int64  
 8   humidity      33379 non-null  int64  
 9   wind_speed    33379 non-null  float64
 10  wind_deg      33379 non-null  int64  
 11  rain_1h       33379 non-null  float64
 12  snow_1h       33379 non-null  float64
 13  clouds_all    33379 non-null  int64  
 14  weather_main  33379 non-null  object 
 15  year          33379 non-null  int64  
 16  covid         33379 non-null  object 
 17  season        33379 non-null  object 
 18  day_night     33379 non-null  object 
dtypes: float64(7), int64(8), object(4)
memory usage: 4.8+ MB

In [ ]:

for i in ['weather_main', 'covid', 'season', 'day_night']:
    print(i, bike_df[i].nunique())

weather_main 11
covid 3
season 4
day_night 7

In [ ]:

bike_df['weather_main'].unique()

Out[ ]:

array(['Clouds', 'Clear', 'Snow', 'Mist', 'Rain', 'Fog', 'Drizzle',
       'Haze', 'Thunderstorm', 'Smoke', 'Squall'], dtype=object)

In [ ]:

plt.figure(figsize=(10, 5))
sns.boxplot(x='weather_main', y='count', data=bike_df)

Out[ ]:

<Axes: xlabel='weather_main', ylabel='count'>

In [ ]:

bike_df = pd.get_dummies(bike_df, columns=['weather_main', 'covid', 'season', 'day_night'])

In [ ]:

bike_df.head()

Out[ ]:

	count	holiday	temp	feels_like	temp_min	temp_max	pressure	humidity	wind_speed	...	season_winter	day_night_night
0	34	1	-7.17	-12.73	-8.56	-7.09	1030	53	3.6	...	1	1
1	49	1	-7.35	-13.81	-9.03	-7.15	1030	49	4.6	...	1	1
2	37	1	-7.88	-14.05	-9.03	-7.69	1031	52	4.1	...	1	1
3	9	1	-8.10	-14.32	-9.36	-7.89	1031	49	4.1	...	1	1
4	12	1	-8.19	-14.43	-9.46	-8.09	1031	49	4.1	...	1	1

5 rows × 40 columns

In [ ]:

pd.set_option('display.max_columns', 45)

In [ ]:

bike_df.head()

Out[ ]:

	count	holiday	temp	feels_like	temp_min	temp_max	pressure	humidity	wind_speed	wind_deg	clouds_all	year	weather_main_Clear	weather_main_Clouds	covid_precovid	season_winter	day_night_night
0	34	1	-7.17	-12.73	-8.56	-7.09	1030	53	3.6	310	20	2018	0	1	1	1	1
1	49	1	-7.35	-13.81	-9.03	-7.15	1030	49	4.6	310	1	2018	1	0	1	1	1
2	37	1	-7.88	-14.05	-9.03	-7.69	1031	52	4.1	310	1	2018	1	0	1	1	1
3	9	1	-8.10	-14.32	-9.36	-7.89	1031	49	4.1	310	1	2018	1	0	1	1	1
4	12	1	-8.19	-14.43	-9.46	-8.09	1031	49	4.1	330	1	2018	1	0	1	1	1

In [ ]:

from sklearn.model_selection import train_test_split

In [ ]:

X_train, X_test, y_train, y_test = train_test_split(bike_df.drop('count', axis=1), bike_df['count'], test_size=0.2, random_state=10)

2. 의사 결정 나무(Decision Tree)

데이터를 분석하여 그 사이에 존재하는 패턴을 예측 가능한 규칙들의 조합으로 나타내며, 그 모양이 '나무'와 같다고 해서 의사 결정 나무라고 부름
분류(Classification)과 회귀(Regression) 모두 가능
지니계수(Gini Index): 0에 가까울수록 클래스에 속한 불순도가 낮음
엔트로피(Entropy): 결정을 내릴만한 충분한 정보가 데이터에 없다고 보는 것. (0에 가까울수록 결정을 내릴만한 충분한 정보가 있다)
오버피팅(과적합): 훈련데이터에서는 정확하나 테스트데이터에서는 성과가 나쁜 현상을 말함. 훈련 데이터가 적거나 노이즈가 있을 때 또는 알고리즘 자체가 나쁠 때 발생. 의사 결정 나무에서는 나무의 가지가 너무 많거나 크기가 클 때 발생
- 의사 결정 나무에서 오버피팅을 피하는 방법
  - 사전 가지치기: 나무가 다 자라기 전에 알고리즘을 멈추는 방법
  - 사후 가지치기: 의사 결정 나무를 끝까지 돌린 후 밑에서부터 가지를 쳐나가는 방

from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=10)

dt.fit(X_train, y_train)

In [ ]:

pred1 = dt.predict(X_test)

In [ ]:

sns.scatterplot(x=y_test, y=pred1)

Out[ ]:

<Axes: xlabel='count'>

In [ ]:

from sklearn.metrics import mean_squared_error

In [ ]:

mean_squared_error(y_test, pred1, squared=False)

Out[ ]:

228.42843328100884

3. 선형 회귀 vs 의사 결정 나무

In [ ]:

from sklearn.linear_model import LinearRegression

In [ ]:

lr = LinearRegression()

In [ ]:

lr.fit(X_train, y_train)

In [ ]:

pred2 = lr.predict(X_test)

In [ ]:

sns.scatterplot(x=y_test, y=pred2)

Out[ ]:

<Axes: xlabel='count'>

In [ ]:

mean_squared_error(y_test, pred2, squared=False)

Out[ ]:

228.26128192004947

In [ ]:

# 의사 결정 나무 RMSE: 228.42843328100884
# 선형 회귀 RMSE: 228.26128192004947
228.42843328100884 - 228.26128192004947

Out[ ]:

0.167151360959366

In [ ]:

# 하이퍼 파라미터 적용
dt = DecisionTreeRegressor(random_state=10, max_depth=50, min_samples_leaf=30)

In [ ]:

dt.fit(X_train, y_train)

In [ ]:

pred3 = dt.predict(X_test)

In [ ]:

mean_squared_error(y_test, pred3, squared=False)

Out[ ]:

187.3015148952268

In [ ]:

# 의사 결정 나무 RMSE: 228.42843328100884
# 선형 회귀 RMSE: 228.26128192004947
# 의사 결정 나무 파라미터 튜닝 RMSE: 187.3015148952268
from sklearn.tree import plot_tree

In [ ]:

plt.figure(figsize=(24, 12))
plot_tree(dt, max_depth=5, fontsize=12)
plt.show()

In [ ]:

plt.figure(figsize=(24, 12))
plot_tree(dt, max_depth=5, fontsize=12, feature_names=X_train.columns)
plt.show()

(Python) 서포트 벡터 머신 (0)	2023.06.14
(Python) 로지스틱 회귀 (0)	2023.06.14
(Python) 선형 회귀 (0)	2023.06.12
(Python) 타이타닉 데이터셋 (0)	2023.06.12
(Python) 아이리스 데이터셋 (0)	2023.06.12

흰둥이는 코드를 짤 때 짖어 (왈!왈!왈!왈!왈!왈!왈!왈!왈!왈!왈!)

흰둥이는 코드를 짤 때 짖어 (왈!왈!왈!왈!왈!왈!왈!왈!왈!왈!왈!)

(Python) 의사 결정 나무 본문

(Python) 의사 결정 나무

2. 의사 결정 나무(Decision Tree)

3. 선형 회귀 vs 의사 결정 나무

'파이썬 머신러닝, 딥러닝' 카테고리의 다른 글

티스토리툴바

« 2025/04 »
일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30