In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier

import warnings
# Silence every warning for the rest of the session: no category or module
# filter is passed, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides deprecation notices from the libraries imported
# above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:

# Load the customer table from a CSV file located in the working directory.
telco_csv = 'data_telco.csv'
df = pd.read_csv(telco_csv)

In [3]:

# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Out[3]:

	customerID	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	…	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	NaN	NaN	NaN	NaN	1	No	NaN	NaN	NaN	…	NaN	NaN	NaN	NaN	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	NaN	NaN	NaN	NaN	34	Yes	NaN	NaN	NaN	…	NaN	NaN	NaN	NaN	One year	No	Mailed check	56.95	1889.5	No
2	3668-QPYBK	NaN	NaN	NaN	NaN	2	Yes	NaN	NaN	NaN	…	NaN	NaN	NaN	NaN	Month-to-month	Yes	Mailed check	53.85	108.15	Yes
3	7795-CFOCW	NaN	NaN	NaN	NaN	45	No	NaN	NaN	NaN	…	NaN	NaN	NaN	NaN	One year	No	Bank transfer (automatic)	42.30	1840.75	No
4	9237-HQITU	NaN	NaN	NaN	NaN	2	Yes	NaN	NaN	NaN	…	NaN	NaN	NaN	NaN	Month-to-month	Yes	Electronic check	70.70	151.65	Yes

5 rows × 21 columns

In [4]:

# Print each column's non-null count and dtype to spot missing data and
# columns that were read with an unexpected type.
df.info()


RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            6034 non-null   object 
 2   SeniorCitizen     6034 non-null   float64
 3   Partner           6034 non-null   object 
 4   Dependents        6034 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     6034 non-null   object 
 8   InternetService   6034 non-null   object 
 9   OnlineSecurity    6034 non-null   object 
 10  OnlineBackup      6034 non-null   object 
 11  DeviceProtection  6034 non-null   object 
 12  TechSupport       6034 non-null   object 
 13  StreamingTV       6034 non-null   object 
 14  StreamingMovies   6034 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(2), int64(1), object(18)
memory usage: 1.1+ MB

In [5]:

# Count the missing entries in every column.
df.isnull().sum()

Out[5]:

customerID             0
gender              1009
SeniorCitizen       1009
Partner             1009
Dependents          1009
tenure                 0
PhoneService           0
MultipleLines       1009
InternetService     1009
OnlineSecurity      1009
OnlineBackup        1009
DeviceProtection    1009
TechSupport         1009
StreamingTV         1009
StreamingMovies     1009
Contract               0
PaperlessBilling       0
PaymentMethod          0
MonthlyCharges         0
TotalCharges           0
Churn                  0
dtype: int64

In [6]:

# Handle missing values

In [7]:

# Keep rows whose InternetService is missing by giving them their own
# "unknown" category instead of leaving NaN in the column.
df['InternetService'] = df['InternetService'].fillna("unknown")

In [8]:

# Category frequencies after the fill, including the new "unknown" bucket.
df['InternetService'].value_counts()

Out[8]:

Fiber optic    2621
DSL            2092
No             1321
unknown        1009
Name: InternetService, dtype: int64

In [9]:

# Convert data types

In [10]:

# TotalCharges was read as object dtype; parse it as numbers and let
# errors="coerce" turn any entry that cannot be parsed into NaN.
total_charges_raw = df['TotalCharges']
df['TotalCharges'] = pd.to_numeric(total_charges_raw, errors="coerce")

In [11]:

# Re-count missing values per column to see how many TotalCharges entries
# became NaN during the numeric conversion.
df.isna().sum()

Out[11]:

customerID             0
gender              1009
SeniorCitizen       1009
Partner             1009
Dependents          1009
tenure                 0
PhoneService           0
MultipleLines       1009
InternetService        0
OnlineSecurity      1009
OnlineBackup        1009
DeviceProtection    1009
TechSupport         1009
StreamingTV         1009
StreamingMovies     1009
Contract               0
PaperlessBilling       0
PaymentMethod          0
MonthlyCharges         0
TotalCharges          11
Churn                  0
dtype: int64

In [12]:

# Fill the gaps in TotalCharges with the column median.
# NOTE(review): if the missing charges belong to customers with no billing
# history yet, 0 may be a more faithful fill than the median — confirm.
total_charges = df['TotalCharges']
val = total_charges.median()
df['TotalCharges'] = total_charges.fillna(val)

In [13]:

# Re-inspect non-null counts and dtypes after the conversion and the fill.
df.info()


RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            6034 non-null   object 
 2   SeniorCitizen     6034 non-null   float64
 3   Partner           6034 non-null   object 
 4   Dependents        6034 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     6034 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    6034 non-null   object 
 10  OnlineBackup      6034 non-null   object 
 11  DeviceProtection  6034 non-null   object 
 12  TechSupport       6034 non-null   object 
 13  StreamingTV       6034 non-null   object 
 14  StreamingMovies   6034 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   float64
 20  Churn             7043 non-null   object 
dtypes: float64(3), int64(1), object(17)
memory usage: 1.1+ MB

In [14]:

# Summary statistics of the numeric columns

In [15]:

# Count, mean, standard deviation, min/max and quartiles for each numeric column.
df.describe()

Out[15]:

	SeniorCitizen	tenure	MonthlyCharges	TotalCharges
count	6034.000000	7043.000000	7043.000000	7043.000000
mean	0.161916	32.371149	64.761692	2281.916928
std	0.368404	24.559481	30.090047	2265.270398
min	0.000000	0.000000	18.250000	18.800000
25%	0.000000	9.000000	35.500000	402.225000
50%	0.000000	29.000000	70.350000	1397.475000
75%	0.000000	55.000000	89.850000	3786.600000
max	1.000000	72.000000	118.750000	8684.800000

In [16]:

# Category frequencies of InternetService; the bar chart in the following cell
# plots these same counts.
df['InternetService'].value_counts()

Out[16]:

Fiber optic    2621
DSL            2092
No             1321
unknown        1009
Name: InternetService, dtype: int64

In [17]:

# Bar chart of how many customers fall into each InternetService category.
# An explicit Figure/Axes pair is used instead of the implicit pyplot state so
# the title and axis labels are attached to this specific chart.
fig, ax = plt.subplots()
df['InternetService'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Internet Service')
ax.set_xlabel('Internet service type')
ax.set_ylabel('Number of customers');  # trailing ';' hides the Text repr in the cell output

Data Profiling

Similar Notebooks

Leave a comment Cancel reply