Data Analysis For Machine Learning with Python (part-3)

Data Analysis For Machine Learning with Python (part-3)

21/10/19   30 minutes read     200 Naren Allam

In this article, you will learn, hands-on, how to obtain a clean data set through data cleaning (data pre-processing) with Python packages, so that it is ready for analysis, meaningful data visualizations, and much more!

Earlier articles in this data analysis series:
Data Analysis For Machine Learning with Python (part-1)
Data Analysis For Machine Learning with Python (part-2)

Data Cleaning or Data pre_processing

Data cleaning (or data pre-processing) means improving the quality of the data by removing errors, handling missing values, dropping unnecessary columns, creating new variables, transforming data, renaming variables, merging data sets, resolving inconsistencies, and so on.

In the previous article (part 2) we discussed NA values, missing values and duplicate values, and how to handle them.
The Python packages used here for data cleaning (data pre-processing) are pandas and NumPy.
This article is a continuation of Data Analysis For Machine Learning with Python (part-2).

PYTHON  Copy
                    
                      #python packages for data analysis
import pandas as pd
import numpy as np
import pandas_profiling as pp

# NA/missing values and duplicate values in this data set were already handled in part 2.
df1 = pd.read_csv('mobiles_data.csv')  # load the data set
# Fixed: the frame loaded above is named df1; `df` is not defined in this article.
display(df1.shape, df1.head())

# NOTE(review): profile_report() is presumably attached to DataFrame by the
# pandas_profiling import above — confirm against the installed version.
df1.profile_report()
                    
                  

The mobile names are displayed below to find out how many distinct mobile names this data set contains.

PYTHON  Copy
                    
                      # Show the number of distinct mobile names, followed by the names themselves.
                      mobile_names = df1.mobiles.unique()
                      display(len(mobile_names), mobile_names)
                    
                  

In the above image, we can see that the data set has 148 distinct mobile names.

In the next step we clean the data set, for example by removing unwanted strings from each column.

PYTHON  Copy
                    
                      # mobile names
# Collapse the free-text mobile names into a fixed set of brand labels;
# every name that matches none of the brands becomes 'others'.
mobile_brands = ['samsung', 'oppo', 'redmi', 'asus', 'lenovo', 'micromax', 'moto', 'vivo', 'honor', 'mi', 'panasonic', 'gionee', 'nokia', 'lava', 'apple', 'google']

# NOTE(review): the article never shows where df2 is created; a working copy
# of df1 is assumed here so that df1 keeps the raw values — confirm.
df2 = df1.copy()

# Match every brand against the ORIGINAL names and label each row only once.
# The previous loop overwrote the column while iterating, so the short brand
# 'mi' later re-matched rows already labelled 'redmi' or 'micromax' (both
# contain "mi") and relabelled them as 'mi'. Longer brand names are tried
# first so that a short name can never swallow a longer one.
original_names = df2['mobiles'].astype(str)
brand_labels = pd.Series('others', index=df2.index)
unlabelled = pd.Series(True, index=df2.index)
for brand in sorted(mobile_brands, key=len, reverse=True):
    # regex=False: brand names are literal substrings, not patterns
    hits = unlabelled & original_names.str.contains(brand, regex=False)
    brand_labels.loc[hits] = brand
    unlabelled &= ~hits
df2['mobiles'] = brand_labels

# Show the reduced set of labels (the original cell printed this twice).
display(len(df2.mobiles.unique()), df2.mobiles.unique())


                    
                  
PYTHON  Copy
                    
                      #color
# Reduce the colour column to a handful of canonical labels,
# showing the distinct values before and after.
colors_before = df2.colors.unique()
display(len(colors_before), colors_before)

colors_list = ['black', 'gold', 'white', 'others']
for shade in colors_list:
    # any value containing the label as a substring is replaced by the label
    contains_shade = df2.colors.str.contains(shade)
    df2.loc[contains_shade, 'colors'] = shade

# everything that was not mapped above goes into the catch-all bucket
unmapped = ~df2['colors'].isin(colors_list)
df2.loc[unmapped, 'colors'] = 'others'

colors_after = df2.colors.unique()
display(len(colors_after), colors_after)
                    
                  
PYTHON  Copy
                    
                      #RAM, ROM and expandable memory

# Strip the unit text from the RAM, ROM and expandable-memory columns,
# showing the distinct values of each column before and after the change.

# RAM: keep the text before the first space.
display(len(df2.ram.unique()), df2.ram.unique())
df2['ram'] = df2['ram'].str.split(" ").str[0]
display(len(df2.ram.unique()), df2.ram.unique())

# ROM: keep the text before the first "g".
display(len(df2.rom.unique()), df2.rom.unique())
df2['rom'] = df2['rom'].str.split("g").str[0]
# Fixed: the second display used to run before the split, so it repeated the
# uncleaned values instead of showing the result.
display(len(df2.rom.unique()), df2.rom.unique())

# Expandable memory: keep the text before the first "g", then before the first "t".
# NOTE(review): the column name suggests both GB and TB values occur; after
# stripping the unit they would share one numeric scale (e.g. a TB value of 1
# next to a GB value of 256) — confirm against the data and convert if needed.
display(len(df2['expandable_gb/tb'].unique()), df2['expandable_gb/tb'].unique())
df2['expandable_gb/tb'] = df2['expandable_gb/tb'].str.split("g").str[0]
df2['expandable_gb/tb'] = df2['expandable_gb/tb'].str.split("t").str[0]
display(len(df2['expandable_gb/tb'].unique()), df2['expandable_gb/tb'].unique())


df2.head(2)
                    
                  
PYTHON  Copy
                    
                      #Dual SIM
# Inspect the distinct dual_sim values, then remove the column.
dual_sim_values = df2['dual_sim'].unique()
display(len(dual_sim_values), dual_sim_values)
# per the original author's note, the column is dropped because its variance is 0
df2.drop(columns=['dual_sim'], inplace=True)
df2.head(2)
                    
                  
PYTHON  Copy
                    
                      #screen_in_inch
# Cast the screen size to float, showing the distinct values before and after.
display(len(df2['screen_in_inch'].unique()), df2['screen_in_inch'].unique())
# Fixed: assign the whole column instead of `df2.loc[:, col] = ...`.
# On pandas >= 2.0 the .loc form writes the values into the existing column
# in place, which can leave its dtype as `object`; the column would then be
# picked up as categorical by a later select_dtypes(include=['object']).
df2['screen_in_inch'] = df2['screen_in_inch'].astype('float')
display(len(df2['screen_in_inch'].unique()), df2['screen_in_inch'].unique())


                    
                  
PYTHON  Copy
                    
                      #display
# Normalise the free-text `display` column into a small set of labels.
# Mid-cell inspections are wrapped in display(); a bare expression is only
# rendered by the notebook when it is the last statement of the cell.
display(df2.head(2))
display(len(df2['display'].unique()), df2['display'].unique())

# Rows whose display value contains 'nadisplay' are shown, then removed.
missing_display = df2['display'].str.contains('nadisplay')
display(df2.loc[missing_display])
# .copy() makes df2 an independent frame, so the assignments below do not
# write into a slice of the previous df2 (avoids SettingWithCopyWarning).
df2 = df2.loc[~missing_display].copy()
display(len(df2['display'].unique()), df2['display'].unique())

# Spelling variants -> canonical label. Values that are already canonical
# ('hd', 'fhd', 'fwvga', 'quadhd') are not keys here and stay unchanged.
display_aliases = {
    'hddisplay': 'hd',
    'fullhddisplay': 'fhd',
    'fullhd': 'fhd',
    'fwvgadisplay': 'fwvga',
    'quadhddisplay': 'quadhd',
    'quarterhddisplay': 'quarterhd',
    'retinahddisplay': 'retinahd',
}
df2['display'] = df2['display'].replace(display_aliases)

display(len(df2['display'].unique()), df2['display'].unique())
df2.head(2)
                    
                  
PYTHON  Copy
                    
                      #Secondary and primary camera

# Cast the secondary camera column to int, showing the distinct values
# before and after.
display(len(df2['secondary_camera(mp)'].unique()), df2['secondary_camera(mp)'].unique())
# Fixed: assign the whole column instead of `df2.loc[:, col] = ...`.
# On pandas >= 2.0 the .loc form writes the values into the existing column
# in place, which can leave its dtype as `object` instead of int.
df2['secondary_camera(mp)'] = df2['secondary_camera(mp)'].astype('int')
display(len(df2['secondary_camera(mp)'].unique()), df2['secondary_camera(mp)'].unique())


                    
                  
PYTHON  Copy
                    
                      # Primary camera
# List the distinct primary-camera specs in df2.
primary_specs = df2['primary_camera(mp)'].unique()
display(len(primary_specs), primary_specs)
# Look up, in df1, the rows whose spec is exactly the string '32 + 5 + 8 '.
is_triple_spec = df1['primary_camera(mp)'] == '32 + 5 + 8 '
df1.loc[is_triple_spec]

                    
                  
PYTHON  Copy
                    
                      # Creating new variable of maximum pixels of the phone
import re

# First decimal ("12.5", ".5") or integer ("48") number in a string.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
_NUMBER_RE = re.compile(r'\d*\.\d+|\d+')


def pixels(ip_string):
    """Return the first number found in *ip_string* as a float.

    Spaces are removed before searching, so "1 2" is read as 12.
    Returns 0.0 when *ip_string* is not a string or contains no number.
    """
    if not isinstance(ip_string, str):
        return 0.0
    match = _NUMBER_RE.search(ip_string.replace(" ", ""))
    return float(match.group()) if match else 0.0


def max_pixels(string):
    """Return the largest value in a '+'-separated spec such as '32 + 5 + 8 '.

    Each part is parsed with pixels(); parts without a number count as 0.
    """
    return max(pixels(part) for part in string.split("+"))

# Derive two features from the raw primary-camera spec, then drop the raw column.
camera_specs = df2['primary_camera(mp)']

# largest value among the '+'-separated parts of each spec
df2['primary_camera_max_pixels'] = camera_specs.apply(max_pixels)
display(df2.shape, df2.head(2))

# number of cameras = number of '+' separators plus one
df2['num_of_cameras'] = camera_specs.apply(lambda spec: spec.count('+') + 1)
display(df2.shape, df2.head(2))

# the raw spec column is removed once both features exist
df2.drop(columns=['primary_camera(mp)'], inplace=True)
display(df2.shape, df2.head(2))


                    
                  
PYTHON  Copy
                    
                      #battery details
# Cast battery_details to int, showing the distinct values before and after.
battery_before = df2['battery_details'].unique()
display(len(battery_before), battery_before)

battery_as_int = df2['battery_details'].astype('int')
df2['battery_details'] = battery_as_int

battery_after = df2['battery_details'].unique()
display(len(battery_after), battery_after)

                    
                  
PYTHON  Copy
                    
                      #Warranty

# Inspect the distinct warranty values; the original author judged them fine as-is.
warranty_values = df2['warranty(year)'].unique()
display(len(warranty_values), warranty_values)

                    
                  
PYTHON  Copy
                    
                      #Type casting and One Hot Encoding
# Cast the cleaned columns to numeric dtypes, printing the schema before and after.
display(df2.head(2))
df2.info()

column_dtypes = {
    'price in rupees': 'float',
    'off(%)': 'float',
    'ram': 'float',
    'rom': 'float',
    'expandable_gb/tb': 'float',
    'warranty(year)': 'int',
    'rating': 'float',
}
# Fixed: `df3 = df2` only created a second name for the same frame, so the
# casts silently modified df2 as well. astype() returns a new frame, which
# keeps df2 as the pre-cast checkpoint.
df3 = df2.astype(column_dtypes)

df3.info()

display(df3.shape, df3.head(2))
                    
                  
PYTHON  Copy
                    
                      # One hot encoding of categorical variables
# One-hot encode every object-typed column; drop_first=True omits the first
# level of each column.
cat_cols = df3.select_dtypes(include=['object']).columns
print(cat_cols)
df4 = pd.get_dummies(df3, columns=cat_cols, drop_first=True)

display(df4.shape, df4.head(2))

# Distinct levels of each encoded column.
for col in cat_cols:
    display(len(df3[col].unique()), df3[col].unique())

# Expected number of columns after encoding: the non-categorical columns plus
# (levels - 1) dummy columns per categorical column. This replaces the
# hand-computed `15-3+14+3+5`, which goes stale whenever the cleaning steps
# change the number of levels. Compare the value with df4.shape[1].
expected_columns = (
    df3.shape[1]
    - len(cat_cols)
    + sum(max(df3[col].nunique() - 1, 0) for col in cat_cols)
)
expected_columns
                    
                  
PYTHON  Copy
                    
                      #Converting rating into categorical variable
# Convert the numeric rating into a binary target, split at the median.

# NOTE(review): the article never shows where df5 is created; it is assumed
# to be a copy of the one-hot encoded frame df4 — confirm.
df5 = df4.copy()

ratings = df5['rating']
# Inspections are wrapped in display(); a bare expression in the middle of a
# cell is evaluated and then discarded.
display(len(ratings.unique()), ratings.min(), ratings.max(), ratings.unique())

break_point = ratings.median()
# sizes of the two classes produced by the split
display(df5[ratings >= break_point].shape, df5[ratings < break_point].shape)

# 1 when the rating is at or above the median, otherwise 0
df5['target_rating'] = (ratings >= break_point).astype('int')

# the raw rating is removed now that the target has been derived from it
df5.drop('rating', axis=1, inplace=True)
df5.columns
                    
                  
PYTHON  Copy
                    
                      #Modelling begins
# Looking at correlation with DV
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation of every column with the dependent variable. It is computed
# once and reused; the original cell recomputed df5.corr() four times.
target_corr = df5.corr()['target_rating']
sorted_corr = target_corr.sort_values(ascending=False)
display(sorted_corr)  # wrapped in display(): a bare mid-cell expression shows nothing

# Heatmap of the sorted correlations, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(sorted_corr.to_frame(), cmap="YlGnBu", ax=ax)

# Feature selection: keep columns whose absolute correlation with the target
# exceeds 0.15. target_rating correlates 1.0 with itself, so it stays in the
# selection and therefore in df_final.
varselect = list(target_corr[target_corr.abs() > 0.15].index)

df_final = df5[varselect]
display(df_final.head(2), df5.shape, df_final.shape)
                    
                  

In this article, we successfully completed data cleaning (data pre-processing).
Next, we build a model on this mobile data set: Data Analysis For Machine Learning with Python (part-4)