
# Predict the segment each customer belongs to and save the predictions
# to a CSV file.
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): unused import of a private pandas helper (likely an IDE
# auto-import); kept only in case code outside this view relies on it.
from pandas.core.common import random_state

X_test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/CS_Seg_X_test.csv')
X_train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/CS_Seg_X_train.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/CS_Seg_y_train.csv')

# Columns seen in X_train.head():
#   ID, Gender, Ever_Married, Age, Graduated, Profession,
#   Work_Experience, Spending_Score, Family_Size

# Check each frame for missing values.
# Training data comes as an (X, y) pair; the test set has features only.
print(f' X_train : {X_train.isnull().sum().sum()}')
print(f' y_train : {y_train.isnull().sum().sum()}')
print(f' X_test : {X_test.isnull().sum().sum()}')

# Split the features into categorical and numeric groups.
categorical_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession',
                    'Spending_Score']
# ID is an identifier, not a feature, so it is dropped from the numeric set.
non_numeric_cols = ['ID'] + categorical_cols

# Categorical features
X_train_word = X_train[categorical_cols]
X_test_word = X_test[categorical_cols]

# Numeric features (the test frame must come from X_test, not X_train)
X_train_num = X_train.drop(columns=non_numeric_cols)
X_test_num = X_test.drop(columns=non_numeric_cols)
print(X_train_num)

# Scale numeric features to [0, 1]; the scaler is fitted on the training
# data only and then applied unchanged to the test data.
scaler = MinMaxScaler()
X_train_num_scale = scaler.fit_transform(X_train_num)
X_test_num_scale = scaler.transform(X_test_num)

# Wrap the scaled arrays back into DataFrames, reusing the source index so
# the later column-wise concat lines up row for row.
df_train_num = pd.DataFrame(X_train_num_scale, columns=X_train_num.columns,
                            index=X_train_num.index)
df_test_num = pd.DataFrame(X_test_num_scale, columns=X_test_num.columns,
                           index=X_test_num.index)

# One-hot encode the categorical features. The test frame is re-indexed to
# the training columns so both have identical features in identical order,
# even if a category is missing from (or only appears in) the test set.
df_train_word = pd.get_dummies(X_train_word)
df_test_word = pd.get_dummies(X_test_word).reindex(
    columns=df_train_word.columns, fill_value=0)

# Combine numeric and categorical features.
df_train = pd.concat([df_train_num, df_train_word], axis=1)
df_test = pd.concat([df_test_num, df_test_word], axis=1)
print(df_train)

# Encode the target labels as integers. Only the Segmentation column is
# mapped, so the ID column of y_train is left untouched.
label_to_code = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
code_to_label = {code: label for label, code in label_to_code.items()}
y_encoded = y_train['Segmentation'].map(label_to_code)
if y_encoded.isnull().any():
    raise ValueError('y_train contains Segmentation labels other than A-D')
y_encoded = y_encoded.astype(int)

# Build the model.
model = xgb.XGBClassifier(random_state=77)

# Hold out 30% of the training data for validation. Separate names are used
# so the original X_train / y_train frames are not overwritten.
# Assumes the rows of y_train are in the same order as X_train - TODO confirm.
X_tr, X_val, y_tr, y_val = train_test_split(
    df_train.values, y_encoded.values, test_size=0.3, random_state=77)

# Train the model.
model.fit(X_tr, y_tr)

# Evaluate on the validation split.
y_val_pred = model.predict(X_val)
print(classification_report(y_val, y_val_pred))

# Predict on the test set (a plain array, matching what the model was
# trained on).
y_pred = model.predict(df_test.values)

# Pair each prediction with the test customer's ID and convert the numeric
# class codes back to their segment letters.
df = pd.DataFrame({'ID': X_test['ID'].values, 'Segmentation': y_pred})
df['Segmentation'] = df['Segmentation'].map(code_to_label)

# Save the result as CSV.
df.to_csv('result.csv', index=False)