In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load the Telco data set from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='data_telco.csv')
In [3]:
# Preview the first five rows.
df.head(n=5)
Out[3]:
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | … | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | NaN | NaN | NaN | NaN | 1 | No | NaN | NaN | NaN | … | NaN | NaN | NaN | NaN | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | NaN | NaN | NaN | NaN | 34 | Yes | NaN | NaN | NaN | … | NaN | NaN | NaN | NaN | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | NaN | NaN | NaN | NaN | 2 | Yes | NaN | NaN | NaN | … | NaN | NaN | NaN | NaN | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | NaN | NaN | NaN | NaN | 45 | No | NaN | NaN | NaN | … | NaN | NaN | NaN | NaN | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | NaN | NaN | NaN | NaN | 2 | Yes | NaN | NaN | NaN | … | NaN | NaN | NaN | NaN | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
In [4]:
# Print each column's dtype and non-null count to spot missing data and mistyped columns.
df.info()
RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 6034 non-null object 2 SeniorCitizen 6034 non-null float64 3 Partner 6034 non-null object 4 Dependents 6034 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 6034 non-null object 8 InternetService 6034 non-null object 9 OnlineSecurity 6034 non-null object 10 OnlineBackup 6034 non-null object 11 DeviceProtection 6034 non-null object 12 TechSupport 6034 non-null object 13 StreamingTV 6034 non-null object 14 StreamingMovies 6034 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(2), int64(1), object(18) memory usage: 1.1+ MB
In [5]:
# Number of missing values in each column.
df.isnull().sum()
Out[5]:
customerID 0 gender 1009 SeniorCitizen 1009 Partner 1009 Dependents 1009 tenure 0 PhoneService 0 MultipleLines 1009 InternetService 1009 OnlineSecurity 1009 OnlineBackup 1009 DeviceProtection 1009 TechSupport 1009 StreamingTV 1009 StreamingMovies 1009 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
In [6]:
# Handle missing values
In [7]:
# Label missing InternetService entries as "unknown" rather than leaving them as NaN.
df['InternetService'] = df['InternetService'].fillna(value="unknown")
In [8]:
# Check the category counts now that missing values carry the "unknown" label.
df['InternetService'].value_counts()
Out[8]:
Fiber optic 2621 DSL 2092 No 1321 unknown 1009 Name: InternetService, dtype: int64
In [9]:
# Convert TotalCharges from object (string) to a numeric dtype
In [10]:
# errors="coerce" turns every value that cannot be parsed as a number into NaN
# instead of raising, so unparsable entries surface as missing values.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors="coerce")
In [11]:
# Re-count missing values per column after the numeric conversion.
df.isnull().sum()
Out[11]:
customerID 0 gender 1009 SeniorCitizen 1009 Partner 1009 Dependents 1009 tenure 0 PhoneService 0 MultipleLines 1009 InternetService 0 OnlineSecurity 1009 OnlineBackup 1009 DeviceProtection 1009 TechSupport 1009 StreamingTV 1009 StreamingMovies 1009 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 11 Churn 0 dtype: int64
In [12]:
# Replace the TotalCharges values that are NaN after numeric conversion with
# the column median (Series.median ignores NaN by default).
# NOTE(review): tenure has a minimum of 0 in this data; if the missing rows are
# brand-new customers, 0 may be a more faithful fill than the median - confirm.
val = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].where(df['TotalCharges'].notna(), val)
In [13]:
# Re-inspect dtypes and non-null counts after converting and imputing TotalCharges.
df.info()
RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 6034 non-null object 2 SeniorCitizen 6034 non-null float64 3 Partner 6034 non-null object 4 Dependents 6034 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 6034 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 6034 non-null object 10 OnlineBackup 6034 non-null object 11 DeviceProtection 6034 non-null object 12 TechSupport 6034 non-null object 13 StreamingTV 6034 non-null object 14 StreamingMovies 6034 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null float64 20 Churn 7043 non-null object dtypes: float64(3), int64(1), object(17) memory usage: 1.1+ MB
In [14]:
# Summary statistics of the numeric columns
In [15]:
# Count, mean, std, min, quartiles and max for the numeric columns.
df.describe()
Out[15]:
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
---|---|---|---|---|
count | 6034.000000 | 7043.000000 | 7043.000000 | 7043.000000 |
mean | 0.161916 | 32.371149 | 64.761692 | 2281.916928 |
std | 0.368404 | 24.559481 | 30.090047 | 2265.270398 |
min | 0.000000 | 0.000000 | 18.250000 | 18.800000 |
25% | 0.000000 | 9.000000 | 35.500000 | 402.225000 |
50% | 0.000000 | 29.000000 | 70.350000 | 1397.475000 |
75% | 0.000000 | 55.000000 | 89.850000 | 3786.600000 |
max | 1.000000 | 72.000000 | 118.750000 | 8684.800000 |
In [16]:
# Category counts for InternetService; the same values are plotted in the following cell.
df['InternetService'].value_counts()
Out[16]:
Fiber optic 2621 DSL 2092 No 1321 unknown 1009 Name: InternetService, dtype: int64
In [17]:
# Bar chart of the number of rows in each InternetService category.
# Uses an explicit Figure/Axes pair instead of the implicit pyplot state so the
# title and axis labels are attached to this specific plot, and labels both
# axes so the figure can be read on its own.
fig, ax = plt.subplots()
df['InternetService'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Internet Service')
ax.set_xlabel('InternetService')
ax.set_ylabel('Count');
Copyright © Code Fetcher 2022