Pandas Filtering Data

import pandas as pd
# Load the three example data sets used throughout this notebook.
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
# The Netflix file is pipe-delimited, and its first column is used as the index.
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)

Basic filtering

# Work on the first five rows so that every mask below is easy to check by eye.
df = titanic.head(5)
# Bracket access returns the same Series as the attribute form (df.sex).
df["sex"]

0    female
1      male
2    female
3      male
4    female
Name: sex, dtype: object

# Element-wise comparison: produces a boolean Series, True where sex is "female".
df["sex"].eq("female")

0     True
1    False
2     True
3    False
4     True
Name: sex, dtype: bool

# A boolean Series used as a selector keeps only the rows where it is True.
df.loc[df["sex"] == "female"]

# Filtering builds a new frame; df itself has not been modified.
df

df.loc[df["survived"].eq(0)]

# A plain list of booleans works as a mask too (one entry per row of df).
bools = [True, False, True, True, True]
df.loc[bools]

# The same pattern applied to the full data set.
titanic.loc[titanic["survived"].eq(1)]

# Coerce age to numbers before comparing. If the column was read as text
# (for example because of non-numeric placeholders), comparing with the number
# 18 matches nothing; if it was read as numbers, comparing with the string "18"
# matches nothing. Converting first makes the filter work in both cases;
# entries that cannot be parsed become NaN and are simply excluded.
titanic[pd.to_numeric(titanic.age, errors="coerce") == 18]

# ne() is the method form of !=: keep rows whose pclass is anything other than 1.
titanic.loc[titanic["pclass"].ne(1)]

# Numeric comparisons; compare the strict (gt / lt) and inclusive (ge / le)
# variants on the same column to see how the boundary rows are treated.
houses.loc[houses["price"].gt(5_000_000)]

houses.loc[houses["bedrooms"].gt(10)]

houses.loc[houses["bedrooms"].ge(10)]

houses.loc[houses["sqft_living"].lt(500)]

houses.loc[houses["sqft_living"].le(500)]

# between() tests a closed range; inclusive="both" is its default, written out
# here so that it is obvious both end points are kept.
houses.loc[houses["bedrooms"].between(5, 7, inclusive="both")]

houses.loc[houses["grade"].between(11, 13, inclusive="both")]

isin()

# isin() is True for every row whose value appears in the given collection.
countries = ["India", "Japan", "South Korea"]
netflix.loc[netflix["country"].isin(countries)]

# The mask can be stored under a descriptive name and reused as the selector.
mature = netflix["rating"].isin(["TV-MA", "R", "PG-13"])
netflix.loc[mature]

# & is element-wise AND: a row is kept only when every mask is True for it.
# The .eq / .lt / .ge / .le method forms avoid the extra parentheses that
# inline == / < comparisons need around &, which binds more tightly than they do.
women = titanic["sex"].eq("female")
died = titanic["survived"].eq(0)
titanic.loc[women & died]

houses.loc[houses["waterfront"].eq(1) & houses["price"].lt(500_000)]

houses.loc[houses["view"].eq(4)]
houses.loc[houses["grade"].ge(11)]
houses.loc[houses["grade"].ge(11) & houses["view"].eq(4)]

# Naming each condition keeps longer combinations readable.
high_quality = houses["grade"].ge(11)
good_view = houses["view"].eq(4)
smaller = houses["sqft_living"].le(3000)
houses.loc[high_quality & good_view & smaller]

# | is element-wise OR: a row is kept when at least one mask is True for it.
houses.loc[houses["yr_built"].ge(2014)]
houses.loc[houses["yr_renovated"].ge(2014)]
houses.loc[houses["yr_built"].ge(2014) | houses["yr_renovated"].ge(2014)]

netflix.loc[netflix["director"].eq("David Fincher")]
netflix.loc[netflix["director"].eq("Martin Scorsese")]
netflix.loc[netflix["director"].eq("David Fincher") | netflix["director"].eq("Martin Scorsese")]
# isin() expresses the same either-or test on a single column more compactly.
netflix.loc[netflix["director"].isin(["David Fincher", "Martin Scorsese"])]

fincher = netflix["director"].eq("David Fincher")
scorsese = netflix["director"].eq("Martin Scorsese")
recent = netflix["release_year"].gt(2015)
# The parentheses make the OR evaluate before the AND.
netflix.loc[(fincher | scorsese) & recent]

df = titanic.head(5)
women = df["sex"].eq("female")

# ~ is element-wise NOT: it flips every value of a boolean mask.
df.loc[~women]

newly_built = houses["yr_built"].ge(2014)
newly_renovated = houses["yr_renovated"].ge(2014)
recent_homes = newly_built | newly_renovated
houses.loc[recent_homes]
# Inverting the combined mask selects every row the previous line left out.
houses.loc[~recent_homes]

titanic.loc[titanic["survived"].eq(0)]
# The method call is evaluated first, then ~ inverts the resulting mask.
titanic.loc[~titanic["survived"].eq(0)]

# Print a per-column summary (dtype and non-null count). In the output below,
# columns whose non-null count is under the 8807 total entries (e.g. director,
# cast, country) contain missing values.
netflix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 894.5+ KB

isna()

# isna() is True wherever a value is missing.
netflix.loc[netflix["director"].isna()]

# Rows where director AND cast are both missing: test the two columns together
# and require every one of them to be missing in the row.
netflix.loc[netflix[["director", "cast"]].isna().all(axis=1)]

notna()

# notna() is True wherever a value is present. Negating it with ~ therefore
# selects the rows whose director is missing (the same rows an isna() mask
# picks out); drop the ~ to keep the rows that do have a director.
netflix.loc[~netflix["director"].notna()]

Some plots

# Select the survived column for the female rows in a single .loc call, then
# count how often each value occurs.
titanic.loc[titanic["sex"] == "female", "survived"].value_counts()

survived
1    339
0    127
Name: count, dtype: int64

# Same count for the male rows.
titanic.loc[titanic["sex"] == "male", "survived"].value_counts()

survived
0    682
1    161
Name: count, dtype: int64

# Rebuild the mask over the full data set before plotting.
women = titanic["sex"].eq("female")
# Pie chart of the survived value counts for the rows selected by the mask.
titanic.loc[women, "survived"].value_counts().plot.pie()

<Axes: ylabel='count'>

# Same chart for every row the women mask does not select.
titanic.loc[~women, "survived"].value_counts().plot.pie()

<Axes: ylabel='count'>

# Bar chart: number of houses priced above 3,000,000 in each zipcode.
(
    houses.loc[houses["price"] > 3_000_000, "zipcode"]
    .value_counts()
    .plot.bar()
)

<Axes: xlabel='zipcode'>

# Bar chart of the ten zipcodes with the most rows; value_counts() sorts by
# descending count, so head(10) takes the ten most frequent.
houses["zipcode"].value_counts().head(10).plot.bar()

<Axes: xlabel='zipcode'>