본문 바로가기
Data/[스파르타 내일배움캠프]

[TIL]본캠프 42일차(심화 프로젝트 1일차)

by MIN_JOON 2024. 6. 17.
#라이브러리 임포트
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

#csv파일 로드
customers = pd.read_csv('/content/drive/MyDrive/내배캠 심화 프로젝트/customers.csv')
capstone_data_cleaned = pd.read_csv('/content/drive/MyDrive/내배캠 심화 프로젝트/capstone_data_cleaned.csv')
order_items = pd.read_csv('/content/drive/MyDrive/내배캠 심화 프로젝트/order_items.csv')
orders = pd.read_csv('/content/drive/MyDrive/내배캠 심화 프로젝트/orders.csv')
payments = pd.read_csv('/content/drive/MyDrive/내배캠 심화 프로젝트/payments.csv')
products = pd.read_csv('/content/drive/MyDrive/내배캠 심화 프로젝트/products.csv')

#각 테이블 결측치 제거
customers_df = customers.dropna()
order_items_df = order_items.dropna()
orders_df = orders.dropna()
payments_df = payments.dropna()
products_df = products.dropna()

#테이블 합치기
main_data = pd.merge(orders_df , customers_df , how='inner' , on='customer_id')
main_data = pd.merge(main_data , order_items_df , how='inner' , on='order_id')
main_data = pd.merge(main_data , payments_df , how='inner' , on='order_id')
main_data = pd.merge(main_data , products_df , how='inner' , on='product_id')

#메인 테이블 null 제거(재확인)
main_data = main_data.dropna()

#order_status 히스토그램
sns.countplot(x='order_status' , data=main_data);

#시간 삭제하고 년-월-일만 남기기
main_data['order_purchase_timestamp'] = main_data['order_purchase_timestamp'].astype('str').str.split(' ').str[0]
main_data['order_delivered_timestamp'] = main_data['order_delivered_timestamp'].astype('str').str.split(' ').str[0]
main_data['order_approved_at'] = main_data['order_approved_at'].astype('str').str.split(' ').str[0]
main_data['order_estimated_delivery_date'] = main_data['order_estimated_delivery_date'].astype('str').str.split(' ').str[0]

#일별 총 구매 금액 시각화
daily_purchase = main_data.groupby('order_purchase_timestamp',as_index=False)['price'].sum()
daily_purchase['order_purchase_timestamp'] = pd.to_datetime(daily_purchase['order_purchase_timestamp'])
sns.lineplot(x='order_purchase_timestamp',y='price',data=daily_purchase);

#지불 방식 카운트
sns.countplot(data=main_data,x='payment_type');

#일별 운송비 추이
daily_shipping_charges = main_data.groupby('order_purchase_timestamp',as_index=False).agg(shipping_charges=('shipping_charges','sum'))
daily_shipping_charges['order_purchase_timestamp'] = pd.to_datetime(daily_shipping_charges['order_purchase_timestamp'])
sns.lineplot(x='order_purchase_timestamp',y='shipping_charges',data=daily_shipping_charges);
  • 시각화 자료들
    시각화