Load data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
activity = pd.read_csv("data/brazil_activity.csv")
/home/users/s17011720/.conda/envs/jupyter/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3166: DtypeWarning: Columns (1,2,3,40) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
draw = pd.read_csv("data/brazil_draw.csv")
play = pd.read_csv("data/brazil_play.csv")
stage = pd.read_csv("data/brazil_stage.csv")
tutorial = pd.read_csv("data/brazil_tutorial.csv")
activity.head()
items traffic_source_medium traffic_source_source traffic_source_name event_previous_timestamp ga_session_number ga_session_number_timestamp user_level user_level_timestamp frist_version ... activity_name action_name activity_id action_id firebase_event_origin ga_session_number_1 sub_name user_first_touch_timestamp stream_id event_timestamp
0 [] notification Firebase 5.63939e+18 1.619310e+15 11.0 1.619309e+15 111 1619310355711000 1.5.2 ... content_button click_skin 3 3003 app 11.0 firefighter 1.619195e+15 2205916636 1619310355712000
1 [] notification Firebase 5.63939e+18 1.619311e+15 11.0 1.619309e+15 116 1619310716820000 1.5.2 ... content_button close_collection 3 3002 app 11.0 0 1.619195e+15 2205916636 1619310716820000
2 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619538999255000 1.5.2 ... content_button click_skin 3 3003 app 16.0 doctor 1.619195e+15 2205916636 1619538999256000
3 [] notification Firebase 5.63939e+18 1.619309e+15 11.0 1.619309e+15 110 1619310280808000 1.5.2 ... content_button open_collection 3 3001 app 11.0 0 1.619195e+15 2205916636 1619310280808000
4 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619539036765000 1.5.2 ... content_button click_skin 3 3003 app 16.0 police 1.619195e+15 2205916636 1619539036766000

5 rows × 88 columns

activity.shape
(1210979, 88)

Data Cleaning

See how many missing data points we have

missing_values_count = activity.isnull().sum()
missing_values_count[0:10]
items                               0
traffic_source_medium          196002
traffic_source_source          196002
traffic_source_name            284690
event_previous_timestamp       160763
ga_session_number                 207
ga_session_number_timestamp       207
user_level                          0
user_level_timestamp                0
frist_version                      28
dtype: int64

데이터 세트의 값 중 몇 퍼센트가 누락되었는지 확인

# how many total missing values do we have?
total_cells = np.product(activity.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
(total_missing/total_cells) * 100
18.352279436720206

데이터 세트에 있는 셀의 거의 1/5이 비어 있다

# Percentage of NAN Values 
NAN = [(c, activity[c].isna().mean()*100) for c in activity]
NAN = pd.DataFrame(NAN, columns=["column_name", "percentage"])
# Features with more than 50% of missing values.
NAN = NAN[NAN.percentage > 50]
NAN.sort_values("percentage", ascending=False)
column_name percentage
23 firebase_exp_4 100.000000
24 firebase_exp_4_timestamp 100.000000
39 last_gclid 100.000000
43 _ltv_BRL 99.998018
44 _ltv_BRL_timestamp 99.998018
41 _ltv_USD 99.982328
42 _ltv_USD_timestamp 99.982328
40 last_gclid_1 99.619316
37 firebase_last_notification 95.956247
38 firebase_last_notification_timestamp 95.956247
33 firebase_exp_2 87.531328
34 firebase_exp_2_timestamp 87.531328
35 firebase_exp_3 87.530998
36 firebase_exp_3_timestamp 87.530998
NAN_cols = NAN['column_name'].to_list()
activity = activity.drop([NAN_cols],axis=1)
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-44-0c0c5289f3ec> in <module>
----> 1 activity = activity.drop([NAN_cols],axis=1)


~/.conda/envs/jupyter/lib/python3.7/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   4172             level=level,
   4173             inplace=inplace,
-> 4174             errors=errors,
   4175         )
   4176 


~/.conda/envs/jupyter/lib/python3.7/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   3887         for axis, labels in axes.items():
   3888             if labels is not None:
-> 3889                 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
   3890 
   3891         if inplace:


~/.conda/envs/jupyter/lib/python3.7/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
   3921                 new_axis = axis.drop(labels, level=level, errors=errors)
   3922             else:
-> 3923                 new_axis = axis.drop(labels, errors=errors)
   3924             result = self.reindex(**{axis_name: new_axis})
   3925 


~/.conda/envs/jupyter/lib/python3.7/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
   5285         if mask.any():
   5286             if errors != "ignore":
-> 5287                 raise KeyError(f"{labels[mask]} not found in axis")
   5288             indexer = indexer[~mask]
   5289         return self.delete(indexer)


KeyError: "[('firebase_exp_4', 'firebase_exp_4_timestamp', 'firebase_exp_2', 'firebase_exp_2_timestamp', 'firebase_exp_3', 'firebase_exp_3_timestamp', 'firebase_last_notification', 'firebase_last_notification_timestamp', 'last_gclid', 'last_gclid_1', '_ltv_USD', '_ltv_USD_timestamp', '_ltv_BRL', '_ltv_BRL_timestamp')] not found in axis"

Figure out why the data is missing

누락된 값을 처리하려면 값이 누락된 이유를 파악하기 위해 직관을 사용해야 합니다. 이것을 알아내는 데 도움이 되도록 스스로에게 물어볼 수 있는 가장 중요한 질문 중 하나는 다음과 같습니다.

이 값이 기록되지 않았거나 존재하지 않아서 누락된 것입니까?

  • 값이 존재하지 않아서 누락된 경우, 값이 무엇인지 추측하는 것은 의미가 없기 때문에 NaN 값으로 처리.
  • 값이 기록되지 않아 누락된 경우, 해당 열과 행의 다른 값을 기반으로 했을 수 있는 값을 추측. (imputation)

Drop missing values

급하거나 값이 누락된 이유를 파악할 이유가 없는 경우, 누락된 값이 포함된 행이나 열을 제거할 수 있다. (참고 : 일반적으로 중요한 프로젝트에는 이 접근 방식을 권장하지 않는다. 일반적으로 데이터를 살펴보고 누락된 값이 있는 모든 열을 하나씩 살펴보고 실제로 데이터 세트를 파악하는 데 시간을 할애하는 것이 좋다.)

누락된 값이 있는 행을 삭제할 경우, dropna().

activity.dropna()
items traffic_source_medium traffic_source_source traffic_source_name event_previous_timestamp ga_session_number ga_session_number_timestamp user_level user_level_timestamp frist_version ... activity_name action_name activity_id action_id firebase_event_origin ga_session_number_1 sub_name user_first_touch_timestamp stream_id event_timestamp

0 rows × 88 columns

모든 데이터가 제거되었다. 이는 데이터 세트의 모든 행에 하나 이상의 결측값이 있기 때문이다. 대신 하나 이상의 결측값이 있는 모든 열을 제거하는 것이 더 나을 수 있습니다.

columns_with_na_dropped = activity.dropna(axis=1)
columns_with_na_dropped.head()
items user_level user_level_timestamp user_pseudo_id event_bundle_sequence_id platform event_server_timestamp_offset app_info_install_source app_info_firebase_app_id app_info_version ... geo_continent sub_id activity_name action_name activity_id action_id firebase_event_origin sub_name stream_id event_timestamp
0 [] 111 1619310355711000 8da97c0b43305c41d78cc0f5585d6b9a 1572 ANDROID 1651059 com.android.vending 1:142569243604:android:31c6a5c81e681fd8210c41 1.5.2 ... Americas 1008 content_button click_skin 3 3003 app firefighter 2205916636 1619310355712000
1 [] 116 1619310716820000 8da97c0b43305c41d78cc0f5585d6b9a 1647 ANDROID 1681114 com.android.vending 1:142569243604:android:31c6a5c81e681fd8210c41 1.5.2 ... Americas 0 content_button close_collection 3 3002 app 0 2205916636 1619310716820000
2 [] 135 1619538999255000 8da97c0b43305c41d78cc0f5585d6b9a 2047 ANDROID 1436640 com.android.vending 1:142569243604:android:31c6a5c81e681fd8210c41 1.5.2 ... Americas 1005 content_button click_skin 3 3003 app doctor 2205916636 1619538999256000
3 [] 110 1619310280808000 8da97c0b43305c41d78cc0f5585d6b9a 1554 ANDROID 1643764 com.android.vending 1:142569243604:android:31c6a5c81e681fd8210c41 1.5.2 ... Americas 0 content_button open_collection 3 3001 app 0 2205916636 1619310280808000
4 [] 135 1619539036765000 8da97c0b43305c41d78cc0f5585d6b9a 2068 ANDROID 1425169 com.android.vending 1:142569243604:android:31c6a5c81e681fd8210c41 1.5.2 ... Americas 1007 content_button click_skin 3 3003 app police 2205916636 1619539036766000

5 rows × 33 columns

print("Columns in original dataset: %d \n" % activity.shape[1])
print("Columns with na's dropped: %d" % columns_with_na_dropped.shape[1])
Columns in original dataset: 88 

Columns with na's dropped: 33
columns_with_na_dropped.columns
Index(['items', 'user_level', 'user_level_timestamp', 'user_pseudo_id',
       'event_bundle_sequence_id', 'platform', 'event_server_timestamp_offset',
       'app_info_install_source', 'app_info_firebase_app_id',
       'app_info_version', 'app_info_id', 'event_date', 'event_name',
       'device_time_zone_offset_seconds', 'device_operating_system',
       'device_mobile_os_hardware_model', 'device_is_limited_ad_tracking',
       'device_operating_system_version', 'device_language', 'device_category',
       'geo_metro', 'geo_sub_continent', 'geo_country', 'geo_continent',
       'sub_id', 'activity_name', 'action_name', 'activity_id', 'action_id',
       'firebase_event_origin', 'sub_name', 'stream_id', 'event_timestamp'],
      dtype='object')
# 전체가 결측값인 행 또는 열만 삭제
activity.dropna(how='all', axis=1)
items traffic_source_medium traffic_source_source traffic_source_name event_previous_timestamp ga_session_number ga_session_number_timestamp user_level user_level_timestamp frist_version ... activity_name action_name activity_id action_id firebase_event_origin ga_session_number_1 sub_name user_first_touch_timestamp stream_id event_timestamp
0 [] notification Firebase 5.63939e+18 1.619310e+15 11.0 1.619309e+15 111 1619310355711000 1.5.2 ... content_button click_skin 3 3003 app 11.0 firefighter 1.619195e+15 2205916636 1619310355712000
1 [] notification Firebase 5.63939e+18 1.619311e+15 11.0 1.619309e+15 116 1619310716820000 1.5.2 ... content_button close_collection 3 3002 app 11.0 0 1.619195e+15 2205916636 1619310716820000
2 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619538999255000 1.5.2 ... content_button click_skin 3 3003 app 16.0 doctor 1.619195e+15 2205916636 1619538999256000
3 [] notification Firebase 5.63939e+18 1.619309e+15 11.0 1.619309e+15 110 1619310280808000 1.5.2 ... content_button open_collection 3 3001 app 11.0 0 1.619195e+15 2205916636 1619310280808000
4 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619539036765000 1.5.2 ... content_button click_skin 3 3003 app 16.0 police 1.619195e+15 2205916636 1619539036766000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1210974 [] notification Firebase 5639386036271637485 1.619539e+15 16.0 1.619539e+15 135 1619539024856000 1.5.2 ... content_button click_skin 3 3003 app 16.0 basic 1.619195e+15 2205916636 1619539024856000
1210975 [] notification Firebase 5639386036271637485 1.619309e+15 11.0 1.619309e+15 107 1619308894406000 1.5.2 ... content_button click_skin 3 3003 app 11.0 worker 1.619195e+15 2205916636 1619308894406000
1210976 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 16 1619550368873000 1.5.2 ... content_button click_skin 3 3003 app 2.0 basic 1.619197e+15 2205916636 1619550368873000
1210977 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 16 1619550361714000 1.5.2 ... content_button click_skin 3 3003 app 2.0 pringles 1.619197e+15 2205916636 1619550361715000
1210978 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 18 1619550449656000 1.5.2 ... content_button open_collection 3 3001 app 2.0 0 1.619197e+15 2205916636 1619550449656000

1210979 rows × 85 columns

activity.dropna(how='all', axis=0)
items traffic_source_medium traffic_source_source traffic_source_name event_previous_timestamp ga_session_number ga_session_number_timestamp user_level user_level_timestamp frist_version ... activity_name action_name activity_id action_id firebase_event_origin ga_session_number_1 sub_name user_first_touch_timestamp stream_id event_timestamp
0 [] notification Firebase 5.63939e+18 1.619310e+15 11.0 1.619309e+15 111 1619310355711000 1.5.2 ... content_button click_skin 3 3003 app 11.0 firefighter 1.619195e+15 2205916636 1619310355712000
1 [] notification Firebase 5.63939e+18 1.619311e+15 11.0 1.619309e+15 116 1619310716820000 1.5.2 ... content_button close_collection 3 3002 app 11.0 0 1.619195e+15 2205916636 1619310716820000
2 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619538999255000 1.5.2 ... content_button click_skin 3 3003 app 16.0 doctor 1.619195e+15 2205916636 1619538999256000
3 [] notification Firebase 5.63939e+18 1.619309e+15 11.0 1.619309e+15 110 1619310280808000 1.5.2 ... content_button open_collection 3 3001 app 11.0 0 1.619195e+15 2205916636 1619310280808000
4 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619539036765000 1.5.2 ... content_button click_skin 3 3003 app 16.0 police 1.619195e+15 2205916636 1619539036766000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1210974 [] notification Firebase 5639386036271637485 1.619539e+15 16.0 1.619539e+15 135 1619539024856000 1.5.2 ... content_button click_skin 3 3003 app 16.0 basic 1.619195e+15 2205916636 1619539024856000
1210975 [] notification Firebase 5639386036271637485 1.619309e+15 11.0 1.619309e+15 107 1619308894406000 1.5.2 ... content_button click_skin 3 3003 app 11.0 worker 1.619195e+15 2205916636 1619308894406000
1210976 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 16 1619550368873000 1.5.2 ... content_button click_skin 3 3003 app 2.0 basic 1.619197e+15 2205916636 1619550368873000
1210977 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 16 1619550361714000 1.5.2 ... content_button click_skin 3 3003 app 2.0 pringles 1.619197e+15 2205916636 1619550361715000
1210978 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 18 1619550449656000 1.5.2 ... content_button open_collection 3 3001 app 2.0 0 1.619197e+15 2205916636 1619550449656000

1210979 rows × 88 columns

thresh : optional 변수로 not-nan data 즉 value 값이 존재하는 nan이 아닌 데이터가 몇 개면 삭제하지 않을 것인지를 정하는 변수이다.

# 임계치 설정해서 제거하기
activity.dropna(thresh=10, axis=1)
items traffic_source_medium traffic_source_source traffic_source_name event_previous_timestamp ga_session_number ga_session_number_timestamp user_level user_level_timestamp frist_version ... activity_name action_name activity_id action_id firebase_event_origin ga_session_number_1 sub_name user_first_touch_timestamp stream_id event_timestamp
0 [] notification Firebase 5.63939e+18 1.619310e+15 11.0 1.619309e+15 111 1619310355711000 1.5.2 ... content_button click_skin 3 3003 app 11.0 firefighter 1.619195e+15 2205916636 1619310355712000
1 [] notification Firebase 5.63939e+18 1.619311e+15 11.0 1.619309e+15 116 1619310716820000 1.5.2 ... content_button close_collection 3 3002 app 11.0 0 1.619195e+15 2205916636 1619310716820000
2 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619538999255000 1.5.2 ... content_button click_skin 3 3003 app 16.0 doctor 1.619195e+15 2205916636 1619538999256000
3 [] notification Firebase 5.63939e+18 1.619309e+15 11.0 1.619309e+15 110 1619310280808000 1.5.2 ... content_button open_collection 3 3001 app 11.0 0 1.619195e+15 2205916636 1619310280808000
4 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619539036765000 1.5.2 ... content_button click_skin 3 3003 app 16.0 police 1.619195e+15 2205916636 1619539036766000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1210974 [] notification Firebase 5639386036271637485 1.619539e+15 16.0 1.619539e+15 135 1619539024856000 1.5.2 ... content_button click_skin 3 3003 app 16.0 basic 1.619195e+15 2205916636 1619539024856000
1210975 [] notification Firebase 5639386036271637485 1.619309e+15 11.0 1.619309e+15 107 1619308894406000 1.5.2 ... content_button click_skin 3 3003 app 11.0 worker 1.619195e+15 2205916636 1619308894406000
1210976 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 16 1619550368873000 1.5.2 ... content_button click_skin 3 3003 app 2.0 basic 1.619197e+15 2205916636 1619550368873000
1210977 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 16 1619550361714000 1.5.2 ... content_button click_skin 3 3003 app 2.0 pringles 1.619197e+15 2205916636 1619550361715000
1210978 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 18 1619550449656000 1.5.2 ... content_button open_collection 3 3001 app 2.0 0 1.619197e+15 2205916636 1619550449656000

1210979 rows × 85 columns

# 특정 컬럼 내의 결측치만 삭제하려고 할 때
activity.dropna(subset=['items', 'ga_session_number'])
items traffic_source_medium traffic_source_source traffic_source_name event_previous_timestamp ga_session_number ga_session_number_timestamp user_level user_level_timestamp frist_version ... activity_name action_name activity_id action_id firebase_event_origin ga_session_number_1 sub_name user_first_touch_timestamp stream_id event_timestamp
0 [] notification Firebase 5.63939e+18 1.619310e+15 11.0 1.619309e+15 111 1619310355711000 1.5.2 ... content_button click_skin 3 3003 app 11.0 firefighter 1.619195e+15 2205916636 1619310355712000
1 [] notification Firebase 5.63939e+18 1.619311e+15 11.0 1.619309e+15 116 1619310716820000 1.5.2 ... content_button close_collection 3 3002 app 11.0 0 1.619195e+15 2205916636 1619310716820000
2 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619538999255000 1.5.2 ... content_button click_skin 3 3003 app 16.0 doctor 1.619195e+15 2205916636 1619538999256000
3 [] notification Firebase 5.63939e+18 1.619309e+15 11.0 1.619309e+15 110 1619310280808000 1.5.2 ... content_button open_collection 3 3001 app 11.0 0 1.619195e+15 2205916636 1619310280808000
4 [] notification Firebase 5.63939e+18 1.619539e+15 16.0 1.619539e+15 135 1619539036765000 1.5.2 ... content_button click_skin 3 3003 app 16.0 police 1.619195e+15 2205916636 1619539036766000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1210974 [] notification Firebase 5639386036271637485 1.619539e+15 16.0 1.619539e+15 135 1619539024856000 1.5.2 ... content_button click_skin 3 3003 app 16.0 basic 1.619195e+15 2205916636 1619539024856000
1210975 [] notification Firebase 5639386036271637485 1.619309e+15 11.0 1.619309e+15 107 1619308894406000 1.5.2 ... content_button click_skin 3 3003 app 11.0 worker 1.619195e+15 2205916636 1619308894406000
1210976 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 16 1619550368873000 1.5.2 ... content_button click_skin 3 3003 app 2.0 basic 1.619197e+15 2205916636 1619550368873000
1210977 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 16 1619550361714000 1.5.2 ... content_button click_skin 3 3003 app 2.0 pringles 1.619197e+15 2205916636 1619550361715000
1210978 [] notification Firebase 5639386036271637485 1.619550e+15 2.0 1.619550e+15 18 1619550449656000 1.5.2 ... content_button open_collection 3 3001 app 2.0 0 1.619197e+15 2205916636 1619550449656000

1210772 rows × 88 columns

Filling in missing values automatically


Handling missing data

def handling_missing_data(df):
    # 데이터 세트의 값 중 몇 퍼센트가 누락되었는지 확인
    print("how many total missing values do we have?")
    total_cells = np.product(df.shape)
    total_missing = missing_values_count.sum()
    print(round((total_missing/total_cells) * 100, 2), "%")
    
    #
    columns_with_na_dropped = activity.dropna(axis=1)
    columns_with_na_dropped.head()
    

Scaling and normalization

from scipy import stats
from mlxtend.preprocessing import minmax_scaling

import seaborn as sns
import matplotlib.pyplot as plt

# set seed for reproducibility
np.random.seed(0)
cat_cols = activity.select_dtypes(exclude=['int64','float64']).columns
cat_cols
Index(['items', 'traffic_source_medium', 'traffic_source_source',
       'traffic_source_name', 'frist_version', 'cuid', 'ad_id', 'last_gclid_1',
       'user_pseudo_id', 'platform', 'app_info_install_source',
       'app_info_firebase_app_id', 'app_info_version', 'app_info_id',
       'event_name', 'device_operating_system',
       'device_mobile_os_hardware_model', 'device_mobile_marketing_name',
       'device_is_limited_ad_tracking', 'device_advertising_id',
       'device_operating_system_version', 'device_mobile_model_name',
       'device_language', 'device_mobile_brand_name', 'device_category',
       'geo_metro', 'geo_sub_continent', 'geo_city', 'geo_region',
       'geo_country', 'geo_continent', 'ad_id_1', 'firebase_screen_class',
       'activity_name', 'action_name', 'firebase_event_origin', 'sub_name'],
      dtype='object')
len(cat_cols)
37
num_colms = activity._get_numeric_data()
num_colms.head()
event_previous_timestamp ga_session_number ga_session_number_timestamp user_level user_level_timestamp frist_version_timestamp cuid_timestamp firebase_exp_8 firebase_exp_8_timestamp firebase_exp_10 ... ga_session_id_1 firebase_screen_id sub_id engaged_session_event activity_id action_id ga_session_number_1 user_first_touch_timestamp stream_id event_timestamp
0 1.619310e+15 11.0 1.619309e+15 111 1619310355711000 1.619310e+15 1.619310e+15 1.0 1.619195e+15 0.0 ... 1.619309e+09 -8.255892e+18 1008 1.0 3 3003 11.0 1.619195e+15 2205916636 1619310355712000
1 1.619311e+15 11.0 1.619309e+15 116 1619310716820000 1.619310e+15 1.619310e+15 1.0 1.619195e+15 0.0 ... 1.619309e+09 -8.255892e+18 0 1.0 3 3002 11.0 1.619195e+15 2205916636 1619310716820000
2 1.619539e+15 16.0 1.619539e+15 135 1619538999255000 1.619539e+15 1.619539e+15 1.0 1.619195e+15 0.0 ... 1.619539e+09 2.995595e+18 1005 1.0 3 3003 16.0 1.619195e+15 2205916636 1619538999256000
3 1.619309e+15 11.0 1.619309e+15 110 1619310280808000 1.619310e+15 1.619310e+15 1.0 1.619195e+15 0.0 ... 1.619309e+09 -8.255892e+18 0 1.0 3 3001 11.0 1.619195e+15 2205916636 1619310280808000
4 1.619539e+15 16.0 1.619539e+15 135 1619539036765000 1.619539e+15 1.619539e+15 1.0 1.619195e+15 0.0 ... 1.619539e+09 2.995595e+18 1007 1.0 3 3003 16.0 1.619195e+15 2205916636 1619539036766000

5 rows × 52 columns

activity.nunique()
items                               1
traffic_source_medium               5
traffic_source_source               5
traffic_source_name                 7
event_previous_timestamp      1042383
                               ...   
ga_session_number_1                44
sub_name                           10
user_first_touch_timestamp     160848
stream_id                           1
event_timestamp               1202693
Length: 88, dtype: int64
.drop_duplicates()

업데이트:

댓글남기기