- 신규 구매자들의 클러스터링 진행(최종 프로젝트)
- 클러스터링을 위해 고객 한 명당 한 행의 특성으로 이루어진 테이블을 만들도록 전처리
- 전처리를 통해 생성된 테이블로 클러스터링 진행
- 클러스터 수 : 4개, 주성분 수 : 2개
# Build a per-customer feature table from the 2023 data (`new_2023`).
customer_clustering = new_2023.copy()

# Encode the return flag as 0/1: missing or 0 -> 0 (no return), anything else -> 1.
customer_clustering['return_status'] = np.where(
    customer_clustering['return_status'].fillna(0) == 0, 0, 1
)

# One row per customer.
# NOTE(review): `total_order_count` counts rows, not distinct order_ids. If
# `new_2023` holds one row per order line item, this is an item count —
# confirm the table's granularity.
customer_clustering = customer_clustering.groupby('customer_id').agg(
    first_order=('order_date', 'min'),
    last_order=('order_date', 'max'),
    created_at=('created_at', 'min'),
    total_order_count=('order_id', 'count'),
    total_price=('total_price', 'sum'),
    category_count=('category', 'nunique'),
    product_count=('product_name', 'nunique'),
    avg_unit_price=('unit_price', 'mean'),
    return_count=('return_status', 'sum'),
    avg_quantity=('quantity', 'mean'),
).reset_index()

# Purchase cycle: mean number of days between consecutive rows of the same
# customer, rounded to whole days.
order_frequency = new_2023[['customer_id', 'order_date']].sort_values(
    ['customer_id', 'order_date']
)
# The frame is already ordered by date within each customer, so the grouped
# shift yields the previous order date directly.
order_frequency['before_order'] = order_frequency.groupby('customer_id')['order_date'].shift(1)
order_frequency['frequency'] = (
    order_frequency['order_date'] - order_frequency['before_order']
).dt.days
order_frequency = (
    order_frequency.groupby('customer_id')['frequency']
    .mean()
    .round(0)
    .reset_index()
    .rename(columns={'frequency': 'avg_purchase_cycle'})
)

# Number of distinct categories among rows that have a non-null return_status.
# NOTE(review): the 0/1 flag above treats return_status == 0 as "no return",
# but this filter keeps such rows — confirm whether 0 can occur in the raw data.
return_category_count = (
    new_2023[new_2023['return_status'].notna()]
    .groupby('customer_id')['category']
    .nunique()
    .reset_index()
    .rename(columns={'category': 'return_category_count'})
)

# Merge the derived tables; left joins keep every customer.
customer_clustering = pd.merge(customer_clustering, order_frequency, on='customer_id', how='left')
customer_clustering = pd.merge(customer_clustering, return_category_count, how='left', on='customer_id')

# Customers with no matching return rows get 0 categories.
customer_clustering['return_category_count'] = customer_clustering['return_category_count'].fillna(0)
# Days from `created_at` to the first order.
customer_clustering['create_to_first_order'] = (
    customer_clustering['first_order'] - customer_clustering['created_at']
).dt.days
# Customers without a computable gap (e.g. a single row) get a cycle of 0.
customer_clustering['avg_purchase_cycle'] = customer_clustering['avg_purchase_cycle'].fillna(0)
customer_clustering
# PCA (principal component analysis) with n = 2 components.
pca_main = customer_clustering.copy()

# Reduce the date columns to their month number so they can be used as features.
pca_main['first_order_month'] = pca_main['first_order'].dt.month
pca_main['last_order_month'] = pca_main['last_order'].dt.month
pca_main['created_month'] = pca_main['created_at'].dt.month

# Feature selection.
feature_names = [
    'first_order_month', 'last_order_month', 'created_month',
    'total_order_count', 'total_price', 'category_count', 'product_count',
    'avg_unit_price', 'return_count', 'avg_quantity', 'avg_purchase_cycle',
    'return_category_count', 'create_to_first_order',
]
# Take an explicit copy: assigning into a slice of `pca_main` raises
# SettingWithCopyWarning and may not write to the intended frame.
pca_sample = pca_main[feature_names].copy()

# log1p transform. The month columns and avg_quantity are left untouched.
# NOTE(review): log1p is undefined for values <= -1; this presumably relies on
# create_to_first_order never being negative — confirm.
log_columns = [
    'total_order_count', 'total_price', 'category_count', 'product_count',
    'avg_unit_price', 'return_count', 'avg_purchase_cycle',
    'return_category_count', 'create_to_first_order',
]
pca_sample[log_columns] = np.log1p(pca_sample[log_columns])

# Standardise every feature before PCA.
scaler = StandardScaler()
pca_sample_scaled = scaler.fit_transform(pca_sample)

# Run PCA.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(pca_sample_scaled)
# Keep the original (misspelled) name bound in case later cells refer to it.
printcipalComponents = principal_components
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['principal component1', 'principal component2'],
)
principal_df.head(5)

# Check the explained variance: per component, then the total.
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))
# K-means clustering on the two principal components.
optimal_k = 4
component_columns = ['principal component1', 'principal component2']
kmeans = KMeans(n_clusters=optimal_k, random_state=42, init='k-means++')
# Fit on the component columns only, so re-running this cell after the
# 'cluster' column exists does not feed the previous labels back in as a feature.
clusters = kmeans.fit_predict(principal_df[component_columns])
principal_df['cluster'] = clusters

# Visualise the clusters in the principal-component plane.
plt.figure(figsize=(15, 10))
sns.scatterplot(
    data=principal_df,
    x='principal component1',
    y='principal component2',
    hue='cluster',
    palette='viridis',
)
plt.title('KMeans Clustering Results')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.show()
'Data > [스파르타 내일배움캠프]' 카테고리의 다른 글
[TIL]본캠프 77일차 & 78일차 (0) | 2024.08.06 |
---|---|
[WIL]본캠프 16주차 (0) | 2024.08.02 |
[TIL]본캠프 75일차 (0) | 2024.08.01 |
[TIL]본캠프 74일차 (2) | 2024.07.31 |
[TIL]본캠프 72일차&73일차 (0) | 2024.07.30 |