빅데이터 분석 기사 실기 - 작업형 2 정리본
🚨 모든 코드는 파이썬 기준입니다.
📝 작업형 2 문제 풀이 순서
1 ๋จ๊ณ : ๋ฐ์ดํฐ ํ์
ํ๊ธฐ, ๋ฐ์ดํฐ ํ์ต ๋ชจ๋ธ ์ ์ (๋ถ๋ฅ, ํ๊ท)
2 ๋จ๊ณ : ๋ฐ์ดํฐ ์ ๋ฆฌ : ๋ถํ์ํ ๋ฐ์ดํฐ, ๊ฒฐ์ธก์น, ๋ฌธ์ํ ๋ฑ ์ ์ฒ๋ฆฌ๊ฐ ํ์ํ ์ปฌ๋ผ ์ฒดํฌํ๊ธฐ
3 ๋จ๊ณ : ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌํ๊ธฐ
4 ๋จ๊ณ : ๋ฐ์ดํฐ ๋ถ๋ฆฌํ๊ธฐ, ๋ชจ๋ธ ํ์ต ์ ์ต์ ์ ํ์ดํผํ๋ผ๋ฏธํฐ๋ฅผ ์ฐพ๊ธฐ ์ํด ๋ฐ์ดํฐ๋ฅผ ๋ถ๋ฆฌํ์ฌ ํ ์คํธ
- ๋ง์ฝ xtrain, xtest, ytrain์ ํํ๊ฐ ์๋ train, test๋ง ์ฃผ์ด์ง๋ค๋ฉด ํ์๋ก ๋ฐ์ดํฐ ๋ถ๋ฆฌํด์ผํจ
5 ๋จ๊ณ : ๋ฐ์ดํฐ ํ์ต : ๋ฌธ์ ์ ๋ง๋ ๋ฐ์ดํฐ ๋ชจ๋ธ ์ ํํ์ฌ ํ์ต์ํค๊ธฐ
6 ๋จ๊ณ : ์ ์ถํ๊ธฐ ์ ๋ฐ์ดํฐ ํ์ธํ๊ณ ์ ์ถํ๊ธฐ
โ๏ธ ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ
1 ) ๋ถํ์ํ ๋ฐ์ดํฐ ์ญ์
drop ํจ์๋ฅผ ์ฌ์ฉํ์ฌ ๋ฐ์ดํฐ ์ญ์
๋ง์ฝ ๋ฐ์ดํฐ ์ถ์ถ์ด ํ์ํ๋ค๋ฉด pop์ ํตํด ๋ฐ๋ก ์ ์ฅ ( ์ด๋ ์๋ณธ ๋ฐ์ดํฐ์์๋ ์ฌ๋ผ์ง)
# The 'id' column is an identifier, not a predictor: remove it from the
# training features and from the training labels.
x_train = x_train.drop('id', axis=1)
y_train = y_train.drop('id', axis=1)
# pop() removes 'id' from x_test in place and returns it, so the ids stay
# available for building the submission file later.
x_test_id = x_test.pop('id')
2 ) 결측치 처리
isnull()로 확인하고, fillna()를 통해 값 채워넣기
mean(), mode()등 문제에 결측치를 어떻게 처리해야하는지 안나와있으면 describe()를 통해 결측치를 정하면된다.
잘 모르면 일단 평균 아니면 최빈값이면 된다.
최빈값으로 처리 할 시 value_counts()를 통해 확인할 것, 결측치가 최빈값일 수 있음
# print(x_test['환불금액'].describe())
# Fill missing values with a statistic computed on the TRAINING data only and
# reuse that same value for the test data, so both sets are imputed
# consistently and no test-set information leaks into preprocessing.
# NOTE: the column key was restored from mojibake to '환불금액' (refund amount);
# the garbled key would raise KeyError against the real dataset.
refund_mean = x_train['환불금액'].mean()
x_train['환불금액'] = x_train['환불금액'].fillna(refund_mean)
x_test['환불금액'] = x_test['환불금액'].fillna(refund_mean)

# Same pattern for a different dataset: impute AnnualIncome with the training minimum.
income_min = train['AnnualIncome'].min()
train['AnnualIncome'] = train['AnnualIncome'].fillna(income_min)
test['AnnualIncome'] = test['AnnualIncome'].fillna(income_min)
3 ) 문자형 처리 ⭐️ ( 7회 작업형 2 문자형 처리 必 )
object : 원핫인코딩, 라벨 인코딩 등으로 데이터를 변환해야함. 처리하기 애매한 경우 삭제를 할 수 있음
원핫인코딩 : pd.get_dummies()
라벨인코딩 : from sklearn.preprocessing import LabelEncoder
# One-hot encode the listed categorical columns.
# NOTE(review): train and test are encoded separately, so a category that
# appears in only one of them yields mismatched columns — verify before fitting.
col = ['GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad']
train = pd.get_dummies(data = train, columns = col)
test = pd.get_dummies(data = test, columns = col)

# Label-encode 'Employment Type'. The encoder is fitted on the training data
# only and then applied to the test data with transform(), so the same
# category always maps to the same integer in both sets. Re-fitting on the
# test set could silently assign different codes.
# transform() raises ValueError if the test set contains a label unseen in training.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
train['Employment Type'] = encoder.fit_transform(train['Employment Type'])
test['Employment Type'] = encoder.transform(test['Employment Type'])
4 ) ์ค์ผ์ผ๋ง
describe : ํด๋น ํจ์๋ฅผ ํตํด ์ค์ผ์ผ๋ง ๋ฐฉ๋ฒ์ ์ ํํ ์ ์์ผ๋ฉฐ ๊ฒฐ์ธก์น๋ฅผ ์ด๋ค ๊ฐ์ผ๋ก ์ฌ์ฉํ ์ง ์ ํํ ์ ์๋ค.
StandardScaler, RobustScaler, MinMaxScaler ๋ฑ์ด ์์
์ด์์น์ ์ํฅ์ ์ ๋ฐ์ง ์๋ RobustScaler์ ์ฃผ๋ก ์ฌ์ฉํ ์ ์์, ๋ง์ฝ ์ต์, ์ต๋๊ฐ์ด ๋๋ฌด ํฐ ์ฐจ์ด๋ฅผ ๋ณด์ธ๋ค๋ฉด MinMaxScaler๋ฅผ ์ถ์ฒํ๋ค.
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale with min-max scaling.
col = ['GRE Score', 'TOEFL Score']
scaler = MinMaxScaler()
# Learn the per-column min/max from the training data only ...
scaler.fit(xtrain[col])
# ... then apply that same scaling to both the training and the test data.
xtrain[col] = scaler.transform(xtrain[col])
xtest[col] = scaler.transform(xtest[col])
โ๏ธ ๋ฐ์ดํฐ ํ์ต ๋ฐ ์ ์ถ
1 ) ๋ฐ์ดํฐ ๋ถ๋ฆฌ
๋ฐ์ดํฐ๋ฅผ ๋ถ๋ฆฌํ์ฌ ๋ชจ๋ธ์ ํ์ต์ํจ๋ค.
๋ฐ์ดํฐ ํ์ต์ ํตํด n_estimators ์ max_depth์ ๊ฐ์ ๋ณ๊ฒฝํ๋ฉฐ ํ์ต ๊ฒฐ๊ณผ๋ฅผ ๋น๊ตํด ์ต์ ์ ํ์ดํผ ํ๋ผ๋ฏธํฐ ๊ฐ์ ์ฐพ๋๋ค. ์ฌ๊ธฐ์ ํ์ต๊ฒฐ๊ณผ๋ ์ฑ์ ๋ฐฉ์์ ๋ฐ๋ผ ๊ฐ์ ํ์ธํ๋ฉด ๋๋ค.
( from sklearn.metrics import roc_auc_score / import accuracy_score / import r2_score ๋ฑ )
# Data split: hold out 20% of the training data for validation.
# random_state is fixed so that scores obtained with different
# hyperparameters are compared on the same split.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(train, train_travel, test_size = 0.2, random_state = 42)
# Model training
from sklearn.ensemble import RandomForestClassifier
# Hyperparameters to tune; random_state makes the forest itself reproducible.
model = RandomForestClassifier(n_estimators = 50, max_depth = 8, random_state = 42)
model.fit(xtrain, ytrain['Reached.on.Time_Y.N'])
# predict_proba returns one probability column per class.
model_pred = model.predict_proba(xtest)
# Validation result: ROC AUC scored on the second probability column
# (the class listed second in model.classes_).
from sklearn.metrics import roc_auc_score
print(roc_auc_score(ytest['Reached.on.Time_Y.N'],model_pred[:,1]))
2 ) 데이터 학습 모델
분류 : RandomForestClassifier, XGBClassifier
회귀 : RandomForestRegressor, XGBRegressor
🚨 만약 학습결과를 확인할 때 사용해야하는 y_test 값이 존재하지않는다면 최적의 하이퍼 파라미터값을 찾지 않고 ' model = RandomForestRegressor() ' 기본으로 사용하면 된다.
# Data split (optional validation step, kept commented out)
# from sklearn.model_selection import train_test_split
# x_train1, x_test1, y_train1, y_test1 = train_test_split(x_train, y_train, test_size = 0.2, random_state = 42)
# Hyperparameter search (this step uses the held-out x_test1 from the split above)
# from sklearn.ensemble import RandomForestRegressor
# model3 = RandomForestRegressor(n_estimators = 100, max_depth=5, random_state = 42)
# model3.fit(x_train1, np.ravel(y_train1))
# model_pred3 = model3.predict(x_test1)
# from sklearn.metrics import mean_squared_error
# rmse2 = np.sqrt(mean_squared_error(y_test1, model_pred3))
# print("RMSE:", rmse2)  # the value is the square ROOT of the MSE, so label it RMSE
# Final training for the actual submission (this step uses the real x_test to be submitted)
from sklearn.ensemble import RandomForestRegressor
# random_state is fixed so that re-running the script produces the same predictions.
model = RandomForestRegressor(n_estimators = 100, max_depth=5, random_state = 42)
# np.ravel flattens y_train to 1-D before it is passed to fit() as the target.
model.fit(x_train, np.ravel(y_train))
model_pred = model.predict(x_test)
3 ) 결과 확인하여 제출
# Build the submission table once, inspect it, then write that same object to
# disk so the printed and the saved data cannot diverge.
# The ids come from x_test_id (the value returned by x_test.pop('id'));
# the previously referenced name `xtest_id` is never defined.
result = pd.DataFrame({'enrollee_id' : x_test_id, 'target' : model_pred})
print(result)
# index=False keeps the DataFrame index out of the CSV file.
result.to_csv('result.csv', index=False)
📝 문제 풀이
1. 분류 ( RandomForestClassifier )
# Inspect the data
# print(train.info()) # drop : ID / pop : Segmentation
# print(test.info()) # pop : ID
train = train.drop(columns = ['ID'])
# pop() removes the column in place and returns it: the target for train,
# the ids (needed for the submission file) for test.
train_seg = train.pop('Segmentation')
test_id = test.pop('ID')
# object : Gender / Ever_Married /Graduated /Profession /Spending_Score /Var_1
# one-hot encoding : Gender, Ever_Married, Graduated, Spending_Score
# label encoding : Profession, Var_1
col = ['Gender', 'Ever_Married', 'Graduated', 'Spending_Score']
train = pd.get_dummies(data=train, columns=col)
test = pd.get_dummies(data=test, columns=col)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# The encoder is re-fitted per column on the training data, and each test
# column is transformed right after its own fit, so train/test codes agree.
train['Profession'] = encoder.fit_transform(train['Profession'])
test['Profession'] = encoder.transform(test['Profession'])
train['Var_1'] = encoder.fit_transform(train['Var_1'])
test['Var_1'] = encoder.transform(test['Var_1'])
# get_dummies ran separately on train and test, so a category present in only
# one of them would give the two frames different columns. Align test to the
# training layout: missing dummy columns are added as 0, extras are dropped.
test = test.reindex(columns=train.columns, fill_value=0)
from sklearn.model_selection import train_test_split
# random_state fixes the split (and the forest below) so runs are reproducible.
# xtest / ytest are the held-out part; this snippet does not score on them.
xtrain, xtest, ytrain, ytest = train_test_split(train, train_seg, test_size = 0.2, random_state = 42)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state = 42)
model.fit(xtrain, ytrain)
model_pred = model.predict(test)
# Build the submission once and write that same object to disk.
result = pd.DataFrame({'ID': test_id, 'Segmentation': model_pred})
print(result)
result.to_csv('submission.csv', index = False)
2. 회귀 ( RandomForestRegressor ) ⭐️ ( 7회 작업형 2 회귀문제 제출)
# Insurance charges??? regression model RandomForestRegressor
# Insurance Prediction (Regression)
# Today we use medical-insurance data to work on a regression problem that predicts how much a person will pay in insurance charges.
# print(x_train.info()) # unneeded columns : missing values : none / object : sex, smoker, region
# print(x_test.info()) # missing values : none / object : sex, smoker, region
# print(y_train.info())
# Check whether there are missing values
# print(x_train.isnull().sum())
# print(x_test.isnull().sum())
# print(y_train.isnull().sum())
# Drop unneeded columns ('id' is kept from x_test for the submission file)
x_train = x_train.drop(columns=['id'])
x_test_id = x_test.pop('id')
y_train = y_train.drop(columns=['id'])
# one-hot encoding : sex, smoker / label encoding : region
ocol = ['sex', 'smoker']
x_train = pd.get_dummies(data = x_train, columns = ocol)
x_test = pd.get_dummies(data = x_test, columns = ocol)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Fit on train only, then reuse the same mapping for test.
x_train['region'] = encoder.fit_transform(x_train['region'])
x_test['region'] = encoder.transform(x_test['region'])
# Scaling (fit on train only, apply the same scaling to test)
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
rcol = ['age', 'bmi']
x_train[rcol] = scaler.fit_transform(x_train[rcol])
x_test[rcol] = scaler.transform(x_test[rcol])
# Data split (optional validation step, kept commented out)
# from sklearn.model_selection import train_test_split
# x_train1, x_test1, y_train1, y_test1 = train_test_split(x_train, y_train, test_size = 0.2, random_state = 42)
# Hyperparameter search (this step uses the held-out x_test1 from the split above)
# from sklearn.ensemble import RandomForestRegressor
# model3 = RandomForestRegressor(n_estimators = 100, max_depth=5, random_state = 42)
# model3.fit(x_train1, np.ravel(y_train1))
# model_pred3 = model3.predict(x_test1)
# from sklearn.metrics import mean_squared_error
# rmse2 = np.sqrt(mean_squared_error(y_test1, model_pred3))
# print("RMSE:", rmse2)  # the value is the square ROOT of the MSE, so label it RMSE
# Final training for the actual submission (this step uses the real x_test to be submitted)
from sklearn.ensemble import RandomForestRegressor
# random_state is fixed so that re-running the script produces the same predictions.
model = RandomForestRegressor(n_estimators = 100, max_depth=5, random_state = 42)
model.fit(x_train, np.ravel(y_train))
model_pred = model.predict(x_test)
result = pd.DataFrame({'id': x_test_id , 'charges':model_pred})#.to_csv('123.csv', index=False)
print(result)
# Self-check score: RMSE computed explicitly instead of calling an `rmse`
# helper that this snippet never defines, and printed so the value is visible
# when run as a script.
# NOTE(review): y_test is not created anywhere in this snippet — presumably the
# practice environment provides the true labels; confirm before running.
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(y_test['charges'], model_pred)))
2023.11.29 - [๐.๋น ๋ฐ์ดํฐ๋ถ์๊ธฐ์ฌ] - ๋น ๋ฐ์ดํฐ๋ถ์๊ธฐ์ฌ ์ค๊ธฐ ์์ ์ ๋ฆฌ๋ณธ
'๐.์๊ฒฉ์ฆ > ๐.๋น ๋ฐ์ดํฐ๋ถ์๊ธฐ์ฌ' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
[실기] 빅데이터 분석 기사 작업형 3 정리 (Python) (0) | 2023.12.04 |
---|---|
[실기] 빅데이터 분석 기사 작업형 1 정리 (Python) (0) | 2023.12.01 |
๋น ๋ฐ์ดํฐ๋ถ์๊ธฐ์ฌ ์ค๊ธฐ ์์ ์ ๋ฆฌ๋ณธ (0) | 2023.11.29 |
빅데이터분석기사 필기 정리본 (2) | 2023.10.17 |
빅데이터분석기사 7회 필기 합격 후기 (교재 X) (2) | 2023.10.15 |