Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Pandas Filtering Data

Pandas Filtering Data

import pandas as pd
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)

Basic filtering

df = titanic.head()
df.sex
0 female 1 male 2 female 3 male 4 female Name: sex, dtype: object
df.sex == 'female'
0 True 1 False 2 True 3 False 4 True Name: sex, dtype: bool
df[df.sex == 'female']
Loading...
df
Loading...
df[df["survived"] == 0]
Loading...
bools = [True, False, True, True, True]
df[bools]
Loading...
titanic[titanic.survived == 1]
Loading...
titanic[titanic.age == "18"]
Loading...
titanic[titanic.pclass != 1]
Loading...
houses[houses["price"] > 5000000]
Loading...
houses[houses['bedrooms'] > 10]
Loading...
houses[houses['bedrooms'] >= 10]
Loading...
houses[houses["sqft_living"] < 500]
Loading...
houses[houses["sqft_living"] <= 500]
Loading...
houses[houses["bedrooms"].between(5, 7)]
Loading...
houses[houses["grade"].between(11,13)]
Loading...

isin()

countries = ["India", "Japan", "South Korea"]
netflix[netflix["country"].isin(countries)]
Loading...
mature = netflix["rating"].isin(["TV-MA", "R", "PG-13"])
netflix[mature]
Loading...
women = titanic.sex == 'female'
died = titanic.survived == 0
titanic[women & died]
Loading...
houses[(houses["waterfront"] == 1) & (houses["price"] < 500000)]
Loading...
houses[houses["view"]== 4]
houses[houses["grade"] >= 11]
houses[(houses["grade"] >= 11) & (houses["view"] == 4)]
Loading...
high_quality = houses["grade"] >= 11
good_view = houses["view"] == 4
smaller = houses["sqft_living"] <= 3000
houses[high_quality & good_view & smaller]
Loading...
houses[houses["yr_built"] >= 2014]
houses[houses["yr_renovated"] >= 2014]
houses[(houses["yr_built"] >= 2014) | (houses["yr_renovated"] >= 2014)]
Loading...
netflix[netflix["director"] == "David Fincher"]
netflix[netflix["director"] == "Martin Scorsese"]
netflix[(netflix["director"] == "David Fincher") | (netflix["director"] == "Martin Scorsese")]
netflix[netflix["director"].isin(["David Fincher", "Martin Scorsese"])]
Loading...
fincher = netflix["director"] == "David Fincher"
scorsese = netflix["director"] == "Martin Scorsese"
recent = netflix["release_year"] > 2015
netflix[(fincher | scorsese) & recent]
Loading...
df = titanic.head()
women = df.sex == 'female'

# ~ means negation
df[~women] 
Loading...
newly_built = houses["yr_built"] >= 2014
newly_renovated = houses["yr_renovated"] >= 2014
recent_homes = newly_built | newly_renovated
houses[recent_homes]
houses[~recent_homes]
Loading...
titanic[titanic.survived == 0]
titanic[~(titanic.survived == 0)]
Loading...
netflix.info()
<class 'pandas.core.frame.DataFrame'>
Index: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 894.5+ KB

isna()

netflix[netflix["director"].isna()]
Loading...
netflix[netflix["director"].isna() & netflix["cast"].isna()]
Loading...

notna()

netflix[~netflix.director.notna()]
Loading...

Some plots

titanic[titanic["sex"] == "female"].survived.value_counts()
survived 1 339 0 127 Name: count, dtype: int64
titanic[titanic["sex"] == "male"].survived.value_counts()
survived 0 682 1 161 Name: count, dtype: int64
women = titanic["sex"] == "female"
titanic[women].survived.value_counts().plot(kind="pie")
<Axes: ylabel='count'>
<Figure size 640x480 with 1 Axes>
titanic[~women].survived.value_counts().plot(kind="pie")
<Axes: ylabel='count'>
<Figure size 640x480 with 1 Axes>
houses[houses["price"] > 3000000].zipcode.value_counts().plot(kind="bar")
<Axes: xlabel='zipcode'>
<Figure size 640x480 with 1 Axes>
houses.zipcode.value_counts().head(10).plot(kind="bar")
<Axes: xlabel='zipcode'>
<Figure size 640x480 with 1 Axes>