[데이터분석] eda
Load data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
activity = pd.read_csv("data/brazil_activity.csv")
/home/users/s17011720/.conda/envs/jupyter/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3166: DtypeWarning: Columns (1,2,3,40) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
draw = pd.read_csv("data/brazil_draw.csv")
play = pd.read_csv("data/brazil_play.csv")
stage = pd.read_csv("data/brazil_stage.csv")
tutorial = pd.read_csv("data/brazil_tutorial.csv")
activity.head()
items | traffic_source_medium | traffic_source_source | traffic_source_name | event_previous_timestamp | ga_session_number | ga_session_number_timestamp | user_level | user_level_timestamp | frist_version | ... | activity_name | action_name | activity_id | action_id | firebase_event_origin | ga_session_number_1 | sub_name | user_first_touch_timestamp | stream_id | event_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [] | notification | Firebase | 5.63939e+18 | 1.619310e+15 | 11.0 | 1.619309e+15 | 111 | 1619310355711000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | firefighter | 1.619195e+15 | 2205916636 | 1619310355712000 |
1 | [] | notification | Firebase | 5.63939e+18 | 1.619311e+15 | 11.0 | 1.619309e+15 | 116 | 1619310716820000 | 1.5.2 | ... | content_button | close_collection | 3 | 3002 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310716820000 |
2 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619538999255000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | doctor | 1.619195e+15 | 2205916636 | 1619538999256000 |
3 | [] | notification | Firebase | 5.63939e+18 | 1.619309e+15 | 11.0 | 1.619309e+15 | 110 | 1619310280808000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310280808000 |
4 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539036765000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | police | 1.619195e+15 | 2205916636 | 1619539036766000 |
5 rows × 88 columns
activity.shape
(1210979, 88)
Data Cleaning
See how many missing data points we have
missing_values_count = activity.isnull().sum()
missing_values_count[0:10]
items 0
traffic_source_medium 196002
traffic_source_source 196002
traffic_source_name 284690
event_previous_timestamp 160763
ga_session_number 207
ga_session_number_timestamp 207
user_level 0
user_level_timestamp 0
frist_version 28
dtype: int64
데이터 세트의 값 중 몇 퍼센트가 누락되었는지 확인
# how many total missing values do we have?
total_cells = np.product(activity.shape)
total_missing = missing_values_count.sum()
# percent of data that is missing
(total_missing/total_cells) * 100
18.352279436720206
데이터 세트에 있는 셀의 거의 1/5이 비어 있다
# Percentage of NAN Values
NAN = [(c, activity[c].isna().mean()*100) for c in activity]
NAN = pd.DataFrame(NAN, columns=["column_name", "percentage"])
# Features with more than 50% of missing values.
NAN = NAN[NAN.percentage > 50]
NAN.sort_values("percentage", ascending=False)
column_name | percentage | |
---|---|---|
23 | firebase_exp_4 | 100.000000 |
24 | firebase_exp_4_timestamp | 100.000000 |
39 | last_gclid | 100.000000 |
43 | _ltv_BRL | 99.998018 |
44 | _ltv_BRL_timestamp | 99.998018 |
41 | _ltv_USD | 99.982328 |
42 | _ltv_USD_timestamp | 99.982328 |
40 | last_gclid_1 | 99.619316 |
37 | firebase_last_notification | 95.956247 |
38 | firebase_last_notification_timestamp | 95.956247 |
33 | firebase_exp_2 | 87.531328 |
34 | firebase_exp_2_timestamp | 87.531328 |
35 | firebase_exp_3 | 87.530998 |
36 | firebase_exp_3_timestamp | 87.530998 |
NAN_cols = NAN['column_name'].to_list()
activity = activity.drop([NAN_cols],axis=1)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-44-0c0c5289f3ec> in <module>
----> 1 activity = activity.drop([NAN_cols],axis=1)
~/.conda/envs/jupyter/lib/python3.7/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4172 level=level,
4173 inplace=inplace,
-> 4174 errors=errors,
4175 )
4176
~/.conda/envs/jupyter/lib/python3.7/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3887 for axis, labels in axes.items():
3888 if labels is not None:
-> 3889 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
3890
3891 if inplace:
~/.conda/envs/jupyter/lib/python3.7/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
3921 new_axis = axis.drop(labels, level=level, errors=errors)
3922 else:
-> 3923 new_axis = axis.drop(labels, errors=errors)
3924 result = self.reindex(**{axis_name: new_axis})
3925
~/.conda/envs/jupyter/lib/python3.7/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
5285 if mask.any():
5286 if errors != "ignore":
-> 5287 raise KeyError(f"{labels[mask]} not found in axis")
5288 indexer = indexer[~mask]
5289 return self.delete(indexer)
KeyError: "[('firebase_exp_4', 'firebase_exp_4_timestamp', 'firebase_exp_2', 'firebase_exp_2_timestamp', 'firebase_exp_3', 'firebase_exp_3_timestamp', 'firebase_last_notification', 'firebase_last_notification_timestamp', 'last_gclid', 'last_gclid_1', '_ltv_USD', '_ltv_USD_timestamp', '_ltv_BRL', '_ltv_BRL_timestamp')] not found in axis"
Figure out why the data is missing
누락된 값을 처리하려면 값이 누락된 이유를 파악하기 위해 직관을 사용해야 합니다. 이것을 알아내는 데 도움이 되도록 스스로에게 물어볼 수 있는 가장 중요한 질문 중 하나는 다음과 같습니다.
이 값이 기록되지 않았거나 존재하지 않아서 누락된 것입니까?
- 값이 존재하지 않아서 누락된 경우, 값이 무엇인지 추측하는 것은 의미가 없기 때문에 NaN 값으로 처리.
- 값이 기록되지 않아 누락된 경우, 해당 열과 행의 다른 값을 기반으로 했을 수 있는 값을 추측. (imputation)
Drop missing values
급하거나 값이 누락된 이유를 파악할 이유가 없는 경우, 누락된 값이 포함된 행이나 열을 제거할 수 있다. (참고 : 일반적으로 중요한 프로젝트에는 이 접근 방식을 권장하지 않는다. 일반적으로 데이터를 살펴보고 누락된 값이 있는 모든 열을 하나씩 살펴보고 실제로 데이터 세트를 파악하는 데 시간을 할애하는 것이 좋다.)
누락된 값이 있는 행을 삭제할 경우, dropna().
activity.dropna()
items | traffic_source_medium | traffic_source_source | traffic_source_name | event_previous_timestamp | ga_session_number | ga_session_number_timestamp | user_level | user_level_timestamp | frist_version | ... | activity_name | action_name | activity_id | action_id | firebase_event_origin | ga_session_number_1 | sub_name | user_first_touch_timestamp | stream_id | event_timestamp |
---|
0 rows × 88 columns
모든 데이터가 제거되었다. 이는 데이터 세트의 모든 행에 하나 이상의 결측값이 있기 때문이다. 대신 하나 이상의 결측값이 있는 모든 열을 제거하는 것이 더 나을 수 있습니다.
columns_with_na_dropped = activity.dropna(axis=1)
columns_with_na_dropped.head()
items | user_level | user_level_timestamp | user_pseudo_id | event_bundle_sequence_id | platform | event_server_timestamp_offset | app_info_install_source | app_info_firebase_app_id | app_info_version | ... | geo_continent | sub_id | activity_name | action_name | activity_id | action_id | firebase_event_origin | sub_name | stream_id | event_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [] | 111 | 1619310355711000 | 8da97c0b43305c41d78cc0f5585d6b9a | 1572 | ANDROID | 1651059 | com.android.vending | 1:142569243604:android:31c6a5c81e681fd8210c41 | 1.5.2 | ... | Americas | 1008 | content_button | click_skin | 3 | 3003 | app | firefighter | 2205916636 | 1619310355712000 |
1 | [] | 116 | 1619310716820000 | 8da97c0b43305c41d78cc0f5585d6b9a | 1647 | ANDROID | 1681114 | com.android.vending | 1:142569243604:android:31c6a5c81e681fd8210c41 | 1.5.2 | ... | Americas | 0 | content_button | close_collection | 3 | 3002 | app | 0 | 2205916636 | 1619310716820000 |
2 | [] | 135 | 1619538999255000 | 8da97c0b43305c41d78cc0f5585d6b9a | 2047 | ANDROID | 1436640 | com.android.vending | 1:142569243604:android:31c6a5c81e681fd8210c41 | 1.5.2 | ... | Americas | 1005 | content_button | click_skin | 3 | 3003 | app | doctor | 2205916636 | 1619538999256000 |
3 | [] | 110 | 1619310280808000 | 8da97c0b43305c41d78cc0f5585d6b9a | 1554 | ANDROID | 1643764 | com.android.vending | 1:142569243604:android:31c6a5c81e681fd8210c41 | 1.5.2 | ... | Americas | 0 | content_button | open_collection | 3 | 3001 | app | 0 | 2205916636 | 1619310280808000 |
4 | [] | 135 | 1619539036765000 | 8da97c0b43305c41d78cc0f5585d6b9a | 2068 | ANDROID | 1425169 | com.android.vending | 1:142569243604:android:31c6a5c81e681fd8210c41 | 1.5.2 | ... | Americas | 1007 | content_button | click_skin | 3 | 3003 | app | police | 2205916636 | 1619539036766000 |
5 rows × 33 columns
print("Columns in original dataset: %d \n" % activity.shape[1])
print("Columns with na's dropped: %d" % columns_with_na_dropped.shape[1])
Columns in original dataset: 88
Columns with na's dropped: 33
columns_with_na_dropped.columns
Index(['items', 'user_level', 'user_level_timestamp', 'user_pseudo_id',
'event_bundle_sequence_id', 'platform', 'event_server_timestamp_offset',
'app_info_install_source', 'app_info_firebase_app_id',
'app_info_version', 'app_info_id', 'event_date', 'event_name',
'device_time_zone_offset_seconds', 'device_operating_system',
'device_mobile_os_hardware_model', 'device_is_limited_ad_tracking',
'device_operating_system_version', 'device_language', 'device_category',
'geo_metro', 'geo_sub_continent', 'geo_country', 'geo_continent',
'sub_id', 'activity_name', 'action_name', 'activity_id', 'action_id',
'firebase_event_origin', 'sub_name', 'stream_id', 'event_timestamp'],
dtype='object')
# 전체가 결측값인 행 또는 열만 삭제
activity.dropna(how='all', axis=1)
items | traffic_source_medium | traffic_source_source | traffic_source_name | event_previous_timestamp | ga_session_number | ga_session_number_timestamp | user_level | user_level_timestamp | frist_version | ... | activity_name | action_name | activity_id | action_id | firebase_event_origin | ga_session_number_1 | sub_name | user_first_touch_timestamp | stream_id | event_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [] | notification | Firebase | 5.63939e+18 | 1.619310e+15 | 11.0 | 1.619309e+15 | 111 | 1619310355711000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | firefighter | 1.619195e+15 | 2205916636 | 1619310355712000 |
1 | [] | notification | Firebase | 5.63939e+18 | 1.619311e+15 | 11.0 | 1.619309e+15 | 116 | 1619310716820000 | 1.5.2 | ... | content_button | close_collection | 3 | 3002 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310716820000 |
2 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619538999255000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | doctor | 1.619195e+15 | 2205916636 | 1619538999256000 |
3 | [] | notification | Firebase | 5.63939e+18 | 1.619309e+15 | 11.0 | 1.619309e+15 | 110 | 1619310280808000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310280808000 |
4 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539036765000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | police | 1.619195e+15 | 2205916636 | 1619539036766000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1210974 | [] | notification | Firebase | 5639386036271637485 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539024856000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | basic | 1.619195e+15 | 2205916636 | 1619539024856000 |
1210975 | [] | notification | Firebase | 5639386036271637485 | 1.619309e+15 | 11.0 | 1.619309e+15 | 107 | 1619308894406000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | worker | 1.619195e+15 | 2205916636 | 1619308894406000 |
1210976 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 16 | 1619550368873000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 2.0 | basic | 1.619197e+15 | 2205916636 | 1619550368873000 |
1210977 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 16 | 1619550361714000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 2.0 | pringles | 1.619197e+15 | 2205916636 | 1619550361715000 |
1210978 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 18 | 1619550449656000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 2.0 | 0 | 1.619197e+15 | 2205916636 | 1619550449656000 |
1210979 rows × 85 columns
activity.dropna(how='all', axis=0)
items | traffic_source_medium | traffic_source_source | traffic_source_name | event_previous_timestamp | ga_session_number | ga_session_number_timestamp | user_level | user_level_timestamp | frist_version | ... | activity_name | action_name | activity_id | action_id | firebase_event_origin | ga_session_number_1 | sub_name | user_first_touch_timestamp | stream_id | event_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [] | notification | Firebase | 5.63939e+18 | 1.619310e+15 | 11.0 | 1.619309e+15 | 111 | 1619310355711000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | firefighter | 1.619195e+15 | 2205916636 | 1619310355712000 |
1 | [] | notification | Firebase | 5.63939e+18 | 1.619311e+15 | 11.0 | 1.619309e+15 | 116 | 1619310716820000 | 1.5.2 | ... | content_button | close_collection | 3 | 3002 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310716820000 |
2 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619538999255000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | doctor | 1.619195e+15 | 2205916636 | 1619538999256000 |
3 | [] | notification | Firebase | 5.63939e+18 | 1.619309e+15 | 11.0 | 1.619309e+15 | 110 | 1619310280808000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310280808000 |
4 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539036765000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | police | 1.619195e+15 | 2205916636 | 1619539036766000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1210974 | [] | notification | Firebase | 5639386036271637485 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539024856000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | basic | 1.619195e+15 | 2205916636 | 1619539024856000 |
1210975 | [] | notification | Firebase | 5639386036271637485 | 1.619309e+15 | 11.0 | 1.619309e+15 | 107 | 1619308894406000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | worker | 1.619195e+15 | 2205916636 | 1619308894406000 |
1210976 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 16 | 1619550368873000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 2.0 | basic | 1.619197e+15 | 2205916636 | 1619550368873000 |
1210977 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 16 | 1619550361714000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 2.0 | pringles | 1.619197e+15 | 2205916636 | 1619550361715000 |
1210978 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 18 | 1619550449656000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 2.0 | 0 | 1.619197e+15 | 2205916636 | 1619550449656000 |
1210979 rows × 88 columns
thresh : optional 변수로 not-nan data 즉 value 값이 존재하는 nan이 아닌 데이터가 몇 개면 삭제하지 않을 것인지를 정하는 변수이다.
# 임계치 설정해서 제거하기
activity.dropna(thresh=10, axis=1)
items | traffic_source_medium | traffic_source_source | traffic_source_name | event_previous_timestamp | ga_session_number | ga_session_number_timestamp | user_level | user_level_timestamp | frist_version | ... | activity_name | action_name | activity_id | action_id | firebase_event_origin | ga_session_number_1 | sub_name | user_first_touch_timestamp | stream_id | event_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [] | notification | Firebase | 5.63939e+18 | 1.619310e+15 | 11.0 | 1.619309e+15 | 111 | 1619310355711000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | firefighter | 1.619195e+15 | 2205916636 | 1619310355712000 |
1 | [] | notification | Firebase | 5.63939e+18 | 1.619311e+15 | 11.0 | 1.619309e+15 | 116 | 1619310716820000 | 1.5.2 | ... | content_button | close_collection | 3 | 3002 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310716820000 |
2 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619538999255000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | doctor | 1.619195e+15 | 2205916636 | 1619538999256000 |
3 | [] | notification | Firebase | 5.63939e+18 | 1.619309e+15 | 11.0 | 1.619309e+15 | 110 | 1619310280808000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310280808000 |
4 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539036765000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | police | 1.619195e+15 | 2205916636 | 1619539036766000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1210974 | [] | notification | Firebase | 5639386036271637485 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539024856000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | basic | 1.619195e+15 | 2205916636 | 1619539024856000 |
1210975 | [] | notification | Firebase | 5639386036271637485 | 1.619309e+15 | 11.0 | 1.619309e+15 | 107 | 1619308894406000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | worker | 1.619195e+15 | 2205916636 | 1619308894406000 |
1210976 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 16 | 1619550368873000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 2.0 | basic | 1.619197e+15 | 2205916636 | 1619550368873000 |
1210977 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 16 | 1619550361714000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 2.0 | pringles | 1.619197e+15 | 2205916636 | 1619550361715000 |
1210978 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 18 | 1619550449656000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 2.0 | 0 | 1.619197e+15 | 2205916636 | 1619550449656000 |
1210979 rows × 85 columns
# 특정 컬럼 내의 결측치만 삭제하려고 할 때
activity.dropna(subset=['items', 'ga_session_number'])
items | traffic_source_medium | traffic_source_source | traffic_source_name | event_previous_timestamp | ga_session_number | ga_session_number_timestamp | user_level | user_level_timestamp | frist_version | ... | activity_name | action_name | activity_id | action_id | firebase_event_origin | ga_session_number_1 | sub_name | user_first_touch_timestamp | stream_id | event_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [] | notification | Firebase | 5.63939e+18 | 1.619310e+15 | 11.0 | 1.619309e+15 | 111 | 1619310355711000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | firefighter | 1.619195e+15 | 2205916636 | 1619310355712000 |
1 | [] | notification | Firebase | 5.63939e+18 | 1.619311e+15 | 11.0 | 1.619309e+15 | 116 | 1619310716820000 | 1.5.2 | ... | content_button | close_collection | 3 | 3002 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310716820000 |
2 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619538999255000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | doctor | 1.619195e+15 | 2205916636 | 1619538999256000 |
3 | [] | notification | Firebase | 5.63939e+18 | 1.619309e+15 | 11.0 | 1.619309e+15 | 110 | 1619310280808000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 11.0 | 0 | 1.619195e+15 | 2205916636 | 1619310280808000 |
4 | [] | notification | Firebase | 5.63939e+18 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539036765000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | police | 1.619195e+15 | 2205916636 | 1619539036766000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1210974 | [] | notification | Firebase | 5639386036271637485 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539024856000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 16.0 | basic | 1.619195e+15 | 2205916636 | 1619539024856000 |
1210975 | [] | notification | Firebase | 5639386036271637485 | 1.619309e+15 | 11.0 | 1.619309e+15 | 107 | 1619308894406000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 11.0 | worker | 1.619195e+15 | 2205916636 | 1619308894406000 |
1210976 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 16 | 1619550368873000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 2.0 | basic | 1.619197e+15 | 2205916636 | 1619550368873000 |
1210977 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 16 | 1619550361714000 | 1.5.2 | ... | content_button | click_skin | 3 | 3003 | app | 2.0 | pringles | 1.619197e+15 | 2205916636 | 1619550361715000 |
1210978 | [] | notification | Firebase | 5639386036271637485 | 1.619550e+15 | 2.0 | 1.619550e+15 | 18 | 1619550449656000 | 1.5.2 | ... | content_button | open_collection | 3 | 3001 | app | 2.0 | 0 | 1.619197e+15 | 2205916636 | 1619550449656000 |
1210772 rows × 88 columns
Filling in missing values automatically
Handling missing data
def handling_missing_data(df):
# 데이터 세트의 값 중 몇 퍼센트가 누락되었는지 확인
print("how many total missing values do we have?")
total_cells = np.product(df.shape)
total_missing = missing_values_count.sum()
print(round((total_missing/total_cells) * 100, 2), "%")
#
columns_with_na_dropped = activity.dropna(axis=1)
columns_with_na_dropped.head()
Scaling and normalization
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
import matplotlib.pyplot as plt
# set seed for reproducibility
np.random.seed(0)
cat_cols = activity.select_dtypes(exclude=['int64','float64']).columns
cat_cols
Index(['items', 'traffic_source_medium', 'traffic_source_source',
'traffic_source_name', 'frist_version', 'cuid', 'ad_id', 'last_gclid_1',
'user_pseudo_id', 'platform', 'app_info_install_source',
'app_info_firebase_app_id', 'app_info_version', 'app_info_id',
'event_name', 'device_operating_system',
'device_mobile_os_hardware_model', 'device_mobile_marketing_name',
'device_is_limited_ad_tracking', 'device_advertising_id',
'device_operating_system_version', 'device_mobile_model_name',
'device_language', 'device_mobile_brand_name', 'device_category',
'geo_metro', 'geo_sub_continent', 'geo_city', 'geo_region',
'geo_country', 'geo_continent', 'ad_id_1', 'firebase_screen_class',
'activity_name', 'action_name', 'firebase_event_origin', 'sub_name'],
dtype='object')
len(cat_cols)
37
num_colms = activity._get_numeric_data()
num_colms.head()
event_previous_timestamp | ga_session_number | ga_session_number_timestamp | user_level | user_level_timestamp | frist_version_timestamp | cuid_timestamp | firebase_exp_8 | firebase_exp_8_timestamp | firebase_exp_10 | ... | ga_session_id_1 | firebase_screen_id | sub_id | engaged_session_event | activity_id | action_id | ga_session_number_1 | user_first_touch_timestamp | stream_id | event_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.619310e+15 | 11.0 | 1.619309e+15 | 111 | 1619310355711000 | 1.619310e+15 | 1.619310e+15 | 1.0 | 1.619195e+15 | 0.0 | ... | 1.619309e+09 | -8.255892e+18 | 1008 | 1.0 | 3 | 3003 | 11.0 | 1.619195e+15 | 2205916636 | 1619310355712000 |
1 | 1.619311e+15 | 11.0 | 1.619309e+15 | 116 | 1619310716820000 | 1.619310e+15 | 1.619310e+15 | 1.0 | 1.619195e+15 | 0.0 | ... | 1.619309e+09 | -8.255892e+18 | 0 | 1.0 | 3 | 3002 | 11.0 | 1.619195e+15 | 2205916636 | 1619310716820000 |
2 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619538999255000 | 1.619539e+15 | 1.619539e+15 | 1.0 | 1.619195e+15 | 0.0 | ... | 1.619539e+09 | 2.995595e+18 | 1005 | 1.0 | 3 | 3003 | 16.0 | 1.619195e+15 | 2205916636 | 1619538999256000 |
3 | 1.619309e+15 | 11.0 | 1.619309e+15 | 110 | 1619310280808000 | 1.619310e+15 | 1.619310e+15 | 1.0 | 1.619195e+15 | 0.0 | ... | 1.619309e+09 | -8.255892e+18 | 0 | 1.0 | 3 | 3001 | 11.0 | 1.619195e+15 | 2205916636 | 1619310280808000 |
4 | 1.619539e+15 | 16.0 | 1.619539e+15 | 135 | 1619539036765000 | 1.619539e+15 | 1.619539e+15 | 1.0 | 1.619195e+15 | 0.0 | ... | 1.619539e+09 | 2.995595e+18 | 1007 | 1.0 | 3 | 3003 | 16.0 | 1.619195e+15 | 2205916636 | 1619539036766000 |
5 rows × 52 columns
activity.nunique()
items 1
traffic_source_medium 5
traffic_source_source 5
traffic_source_name 7
event_previous_timestamp 1042383
...
ga_session_number_1 44
sub_name 10
user_first_touch_timestamp 160848
stream_id 1
event_timestamp 1202693
Length: 88, dtype: int64
.drop_duplicates()
댓글남기기