Pandas Filtering Data¶
import pandas as pd
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)Basic filtering¶
df = titanic.head()
df.sex0 female
1 male
2 female
3 male
4 female
Name: sex, dtype: objectdf.sex == 'female'0 True
1 False
2 True
3 False
4 True
Name: sex, dtype: booldf[df.sex == 'female']Loading...
dfLoading...
df[df["survived"] == 0]Loading...
bools = [True, False, True, True, True]
df[bools]Loading...
titanic[titanic.survived == 1]Loading...
titanic[titanic.age == "18"]Loading...
titanic[titanic.pclass != 1]Loading...
houses[houses["price"] > 5000000]Loading...
houses[houses['bedrooms'] > 10]Loading...
houses[houses['bedrooms'] >= 10]Loading...
houses[houses["sqft_living"] < 500]Loading...
houses[houses["sqft_living"] <= 500]Loading...
houses[houses["bedrooms"].between(5, 7)]Loading...
houses[houses["grade"].between(11,13)]Loading...
isin()¶
countries = ["India", "Japan", "South Korea"]
netflix[netflix["country"].isin(countries)]Loading...
mature = netflix["rating"].isin(["TV-MA", "R", "PG-13"])
netflix[mature]Loading...
women = titanic.sex == 'female'
died = titanic.survived == 0
titanic[women & died]Loading...
houses[(houses["waterfront"] == 1) & (houses["price"] < 500000)]Loading...
houses[houses["view"]== 4]
houses[houses["grade"] >= 11]
houses[(houses["grade"] >= 11) & (houses["view"] == 4)]Loading...
high_quality = houses["grade"] >= 11
good_view = houses["view"] == 4
smaller = houses["sqft_living"] <= 3000
houses[high_quality & good_view & smaller]Loading...
houses[houses["yr_built"] >= 2014]
houses[houses["yr_renovated"] >= 2014]
houses[(houses["yr_built"] >= 2014) | (houses["yr_renovated"] >= 2014)]Loading...
netflix[netflix["director"] == "David Fincher"]
netflix[netflix["director"] == "Martin Scorsese"]
netflix[(netflix["director"] == "David Fincher") | (netflix["director"] == "Martin Scorsese")]
netflix[netflix["director"].isin(["David Fincher", "Martin Scorsese"])]Loading...
fincher = netflix["director"] == "David Fincher"
scorsese = netflix["director"] == "Martin Scorsese"
recent = netflix["release_year"] > 2015
netflix[(fincher | scorsese) & recent]Loading...
df = titanic.head()
women = df.sex == 'female'
# ~ means negation
df[~women] Loading...
newly_built = houses["yr_built"] >= 2014
newly_renovated = houses["yr_renovated"] >= 2014
recent_homes = newly_built | newly_renovated
houses[recent_homes]
houses[~recent_homes]Loading...
titanic[titanic.survived == 0]
titanic[~(titanic.survived == 0)]Loading...
netflix.info()<class 'pandas.core.frame.DataFrame'>
Index: 8807 entries, 0 to 8806
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 show_id 8807 non-null object
1 type 8807 non-null object
2 title 8807 non-null object
3 director 6173 non-null object
4 cast 7982 non-null object
5 country 7976 non-null object
6 date_added 8797 non-null object
7 release_year 8807 non-null int64
8 rating 8803 non-null object
9 duration 8804 non-null object
10 listed_in 8807 non-null object
11 description 8807 non-null object
dtypes: int64(1), object(11)
memory usage: 894.5+ KB
isna()¶
netflix[netflix["director"].isna()]Loading...
netflix[netflix["director"].isna() & netflix["cast"].isna()]Loading...
notna()¶
netflix[~netflix.director.notna()]Loading...
Some plots¶
titanic[titanic["sex"] == "female"].survived.value_counts()survived
1 339
0 127
Name: count, dtype: int64titanic[titanic["sex"] == "male"].survived.value_counts()survived
0 682
1 161
Name: count, dtype: int64women = titanic["sex"] == "female"
titanic[women].survived.value_counts().plot(kind="pie")<Axes: ylabel='count'>
titanic[~women].survived.value_counts().plot(kind="pie")<Axes: ylabel='count'>
houses[houses["price"] > 3000000].zipcode.value_counts().plot(kind="bar")<Axes: xlabel='zipcode'>
houses.zipcode.value_counts().head(10).plot(kind="bar")<Axes: xlabel='zipcode'>