Skip to article frontmatter | Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Working With Types

Casting Types & Missing Values

import pandas as pd
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)
btc = pd.read_csv("data/coin_Bitcoin.csv")
countries = pd.read_csv("data/world-happiness-report-2021.csv")

Casting With astype()

titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB
titanic["age"].value_counts()
age
?         263
24         47
22         43
21         41
30         40
         ... 
60.5        1
74          1
0.4167      1
11.5        1
26.5        1
Name: count, Length: 99, dtype: int64
# This raises a ValueError: the column still contains '?' strings that cannot be converted to float.
# titanic["age"].astype("float")
titanic["age"] = titanic["age"].replace(['?'], [None])
titanic.age.value_counts(dropna=False)
age
None      263
24         47
22         43
21         41
30         40
         ... 
60.5        1
74          1
0.4167      1
11.5        1
26.5        1
Name: count, Length: 99, dtype: int64
titanic["age"].astype("float")
0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
         ...   
1304    14.5000
1305        NaN
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1309, dtype: float64
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1046 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB
titanic["age_float"] = titanic["age"].astype("float")
titanic
Loading...
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   object 
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
 14  age_float  1046 non-null   float64
dtypes: float64(1), int64(4), object(10)
memory usage: 153.5+ KB
titanic["age"] = titanic["age"].astype("float")
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
 14  age_float  1046 non-null   float64
dtypes: float64(2), int64(4), object(9)
memory usage: 153.5+ KB
titanic["age"].mean()
np.float64(29.8811345124283)
titanic["sex"].astype("category")
0       female
1         male
2       female
3         male
4       female
         ...  
1304    female
1305    female
1306      male
1307      male
1308      male
Name: sex, Length: 1309, dtype: category
Categories (2, object): ['female', 'male']
titanic["sex"] = titanic["sex"].astype("category")
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   int64   
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1309 non-null   object  
 9   cabin      1309 non-null   object  
 10  embarked   1309 non-null   object  
 11  boat       1309 non-null   object  
 12  body       1309 non-null   object  
 13  home.dest  1309 non-null   object  
 14  age_float  1046 non-null   float64 
dtypes: category(1), float64(2), int64(4), object(8)
memory usage: 144.7+ KB
titanic["sex"] = "MALE"
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
 14  age_float  1046 non-null   float64
dtypes: float64(2), int64(4), object(9)
memory usage: 153.5+ KB
titanic["embarked"] = titanic["embarked"].astype('category')
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   int64   
 2   name       1309 non-null   object  
 3   sex        1309 non-null   object  
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1309 non-null   object  
 9   cabin      1309 non-null   object  
 10  embarked   1309 non-null   category
 11  boat       1309 non-null   object  
 12  body       1309 non-null   object  
 13  home.dest  1309 non-null   object  
 14  age_float  1046 non-null   float64 
dtypes: category(1), float64(2), int64(4), object(8)
memory usage: 144.8+ KB

Casting with pd.to_numeric()

titanic = pd.read_csv("data/titanic.csv")
titanic["age"].value_counts()
age
?         263
24         47
22         43
21         41
30         40
         ... 
60.5        1
74          1
0.4167      1
11.5        1
26.5        1
Name: count, Length: 99, dtype: int64
pd.to_numeric(titanic["age"], errors="coerce")
0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
         ...   
1304    14.5000
1305        NaN
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1309, dtype: float64
titanic["age"] = pd.to_numeric(titanic["age"], errors="coerce")
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 143.3+ KB
titanic["age"].describe()
count    1046.000000
mean       29.881135
std        14.413500
min         0.166700
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: age, dtype: float64
titanic
Loading...

isna() and dropna()

stats = pd.read_csv("data/game_stats.csv")
stats
Loading...
stats.isna()
Loading...
stats[stats["league"].isna()]
Loading...
stats["assists"].dropna()
0    5.0
5    8.0
Name: assists, dtype: float64
assists = stats["assists"]
assists.dropna(inplace=True)
assists
0    5.0
5    8.0
Name: assists, dtype: float64
stats
Loading...
stats.dropna()
Loading...
stats
Loading...
stats = pd.read_csv("data/game_stats.csv")
stats
Loading...
stats.dropna(how="all")
Loading...
stats.dropna(subset=["league","points"])
Loading...
stats.dropna(axis=1)
Loading...

Filling NA values with fillna()

stats
Loading...
stats.fillna(0)
Loading...
stats["league"] = stats["league"].fillna("amateur")
stats
Loading...
stats.fillna({"points": 0, "assists": "NONE"})
Loading...
sales = pd.read_csv("data/sales.csv")
sales
Loading...
sales["shipping_zip"] = sales["shipping_zip"].fillna(sales["billing_zip"])
sales
Loading...