# Read in some datasets to work with!
import pandas as pd
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)
titanic.head()
Loading...
Column selection
titanic.name
0 Allen, Miss. Elisabeth Walton
1 Allison, Master. Hudson Trevor
2 Allison, Miss. Helen Loraine
3 Allison, Mr. Hudson Joshua Creighton
4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
...
1304 Zabour, Miss. Hileni
1305 Zabour, Miss. Thamine
1306 Zakarian, Mr. Mapriededer
1307 Zakarian, Mr. Ortin
1308 Zimmerman, Mr. Leo
Name: name, Length: 1309, dtype: object
titanic['name']
0 Allen, Miss. Elisabeth Walton
1 Allison, Master. Hudson Trevor
2 Allison, Miss. Helen Loraine
3 Allison, Mr. Hudson Joshua Creighton
4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
...
1304 Zabour, Miss. Hileni
1305 Zabour, Miss. Thamine
1306 Zakarian, Mr. Mapriededer
1307 Zakarian, Mr. Ortin
1308 Zimmerman, Mr. Leo
Name: name, Length: 1309, dtype: object
titanic.age
0 29
1 0.9167
2 2
3 30
4 25
...
1304 14.5
1305 ?
1306 26.5
1307 27
1308 29
Name: age, Length: 1309, dtype: object
titanic["age"]
0 29
1 0.9167
2 2
3 30
4 25
...
1304 14.5
1305 ?
1306 26.5
1307 27
1308 29
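Name: age, Length: 1309, dtype: object
titanic["home.dest"]
# titanic.home.dest will not work: the column name contains a dot, so Python
# reads it as attribute `dest` of `titanic.home`. Use bracket notation instead.
0 St Louis, MO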
1 Montreal, PQ / Chesterville, ON
2 Montreal, PQ / Chesterville, ON
3 Montreal, PQ / Chesterville, ON
4 Montreal, PQ / Chesterville, ON
...
1304 ?
1305 ?
1306 ?
1307 ?
1308 ?
Name: home.dest, Length: 1309, dtype: object
mystery_col = "price"
houses[mystery_col]
0 221900.0
1 538000.0
2 180000.0
3 604000.0
4 510000.0
...
21608 360000.0
21609 400000.0
21610 402101.0
21611 400000.0
21612 325000.0
Name: price, Length: 21613, dtype: float64
titanic
Loading...
names = titanic.name
type(names)
pandas.core.series.Series
names
0 Allen, Miss. Elisabeth Walton
1 Allison, Master. Hudson Trevor
2 Allison, Miss. Helen Loraine
3 Allison, Mr. Hudson Joshua Creighton
4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
...
1304 Zabour, Miss. Hileni
1305 Zabour, Miss. Thamine
1306 Zakarian, Mr. Mapriededer
1307 Zakarian, Mr. Ortin
1308 Zimmerman, Mr. Leo
Name: name, Length: 1309, dtype: object
titanic
Loading...
houses.sum(numeric_only=True)
id 9.899406e+13
price 1.167293e+10
bedrooms 7.285400e+04
bathrooms 4.570625e+04
sqft_living 4.495287e+07
sqft_lot 3.265069e+08
floors 3.229650e+04
waterfront 1.630000e+02
view 5.064000e+03
condition 7.368800e+04
grade 1.654880e+05
sqft_above 3.865249e+07
sqft_basement 6.300385e+06
yr_built 4.259933e+07
yr_renovated 1.824186e+06
zipcode 2.119759e+09
lat 1.027915e+06
long -2.641409e+06
sqft_living15 4.293536e+07
sqft_lot15 2.759646e+08
dtype: float64
houses.price.sum()
np.float64(11672925008.0)
houses.price.max()
np.float64(7700000.0)
names.shape
(1309,)
titanic.shape
(1309, 14)
names.values
array(['Allen, Miss. Elisabeth Walton', 'Allison, Master. Hudson Trevor',
'Allison, Miss. Helen Loraine', ..., 'Zakarian, Mr. Mapriededer',
'Zakarian, Mr. Ortin', 'Zimmerman, Mr. Leo'],
shape=(1309,), dtype=object)
names.index
RangeIndex(start=0, stop=1309, step=1)
names
0 Allen, Miss. Elisabeth Walton
1 Allison, Master. Hudson Trevor
2 Allison, Miss. Helen Loraine
3 Allison, Mr. Hudson Joshua Creighton
4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
...
1304 Zabour, Miss. Hileni
1305 Zabour, Miss. Thamine
1306 Zakarian, Mr. Mapriededer
1307 Zakarian, Mr. Ortin
1308 Zimmerman, Mr. Leo
Name: name, Length: 1309, dtype: object
mins = houses.min(numeric_only=True)
mins.index
Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
'sqft_living15', 'sqft_lot15'],
dtype='object')
mins
id 1.000102e+06
price 7.500000e+04
bedrooms 0.000000e+00
bathrooms 0.000000e+00
sqft_living 2.900000e+02
sqft_lot 5.200000e+02
floors 1.000000e+00
waterfront 0.000000e+00
view 0.000000e+00
condition 1.000000e+00
grade 1.000000e+00
sqft_above 2.900000e+02
sqft_basement 0.000000e+00
yr_built 1.900000e+03
yr_renovated 0.000000e+00
zipcode 9.800100e+04
lat 4.715590e+01
long -1.225190e+02
sqft_living15 3.990000e+02
sqft_lot15 6.510000e+02
dtype: float64
Important Series Methods
- head()
- tail()
- describe()
- unique()
- nunique()
- nlargest()
- nsmallest()
- value_counts()
- plot()
titanic.age.head(10)
0 29
1 0.9167
2 2
3 30
4 25
5 48
6 63
7 39
8 53
9 71
Name: age, dtype: object
netflix.title.tail(50)
8757 World Trade Center
8758 World's Busiest Cities
8759 World's Weirdest Homes
8760 Would You Rather
8761 Wrong No.
8762 Wrong Side Raju
8763 WWII: Report from the Aleutians
8764 Wyatt Earp
8765 XX
8766 XXx
8767 XXX: State of the Union
8768 Y Tu Mamá También
8769 Y.M.I.: Yeh Mera India
8770 Yaadein
8771 Yaara O Dildaara
8772 Yamla Pagla Deewana 2
8773 Yanda Kartavya Aahe
8774 يوم الدين
8775 Yeh Meri Family
8776 Yellowbird
8777 Yes or No
8778 Yes or No 2
8779 Yes or No 2.5
8780 Yo-Kai Watch
8781 Yo-Kai Watch: The Movie
8782 Yoga Hosers
8783 Yoko
8784 Yoko and His Friends
8785 YOM
8786 You Can Tutu
8787 You Can’t Fight Christmas
8788 You Carry Me
8789 You Changed My Life
8790 You Don't Mess with the Zohan
8791 Young Adult
8792 Young Tiger
8793 Yours, Mine and Ours
8794 اشتباك
8795 Yu-Gi-Oh! Arc-V
8796 Yunus Emre
8797 Zak Storm
8798 Zed Plus
8799 Zenda
8800 Zindagi Gulzar Hai
8801 Zinzana
8802 Zodiac
8803 Zombie Dumb
8804 Zombieland
8805 Zoom
8806 Zubaan
Name: title, dtype: object
houses.describe()
Loading...
houses["price"].describe()
count 2.161300e+04
mean 5.400881e+05
std 3.671272e+05
min 7.500000e+04
25% 3.219500e+05
50% 4.500000e+05
75% 6.450000e+05
max 7.700000e+06
Name: price, dtype: float64
netflix["rating"].describe()
count 8803
unique 17
top TV-MA
freq 3207
Name: rating, dtype: object
netflix.dtypes
show_id object
type object
title object
director object
cast object
country object
date_added object
release_year int64
rating object
duration object
listed_in object
description object
dtype: object
netflix.release_year.describe()
count 8807.000000
mean 2014.180198
std 8.819312
min 1925.000000
25% 2013.000000
50% 2017.000000
75% 2019.000000
max 2021.000000
Name: release_year, dtype: float64
titanic.name
0 Allen, Miss. Elisabeth Walton
1 Allison, Master. Hudson Trevor
2 Allison, Miss. Helen Loraine
3 Allison, Mr. Hudson Joshua Creighton
4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
...
1304 Zabour, Miss. Hileni
1305 Zabour, Miss. Thamine
1306 Zakarian, Mr. Mapriededer
1307 Zakarian, Mr. Ortin
1308 Zimmerman, Mr. Leo
Name: name, Length: 1309, dtype: object
houses["bedrooms"].unique()
array([ 3, 2, 4, 5, 1, 6, 7, 0, 8, 9, 11, 10, 33])
netflix.rating.unique()
array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
'TV-Y7-FV', 'UR'], dtype=object)
netflix.rating
0 PG-13
1 TV-MA
2 TV-MA
3 TV-MA
4 TV-MA
...
8802 R
8803 TV-Y7
8804 R
8805 PG
8806 TV-14
Name: rating, Length: 8807, dtype: object
# number of unique `rating` values (NaN is not counted by default)
netflix.rating.nunique()
17
houses.zipcode.nunique()
70
netflix.info()
<class 'pandas.core.frame.DataFrame'>
Index: 8807 entries, 0 to 8806
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 show_id 8807 non-null object
1 type 8807 non-null object
2 title 8807 non-null object
3 director 6173 non-null object
4 cast 7982 non-null object
5 country 7976 non-null object
6 date_added 8797 non-null object
7 release_year 8807 non-null int64
8 rating 8803 non-null object
9 duration 8804 non-null object
10 listed_in 8807 non-null object
11 description 8807 non-null object
dtypes: int64(1), object(11)
memory usage: 894.5+ KB
netflix.head()
Loading...
netflix.rating.nunique(dropna=False) # Includes NaN
18
netflix.rating.nunique()
17
# 9 largest
houses.price.nlargest(9)
7252 7700000.0
3914 7062500.0
9254 6885000.0
4411 5570000.0
1448 5350000.0
1315 5300000.0
1164 5110800.0
8092 4668000.0
2626 4500000.0
Name: price, dtype: float64
houses.price.nsmallest(10)
1149 75000.0
15293 78000.0
465 80000.0
16198 81000.0
8274 82000.0
2141 82500.0
18468 83000.0
3767 84000.0
10253 85000.0
16714 85000.0
Name: price, dtype: float64
titanic.pclass.nlargest(5, keep="all")
600 3
601 3
602 3
603 3
604 3
..
1304 3
1305 3
1306 3
1307 3
1308 3
Name: pclass, Length: 709, dtype: int64
houses.nlargest(10,["price"]) # called on the DataFrame, so the result is a DataFrame
Loading...
houses.nlargest(10, ["bedrooms"])
Loading...
houses.nlargest(10, ["bedrooms", "bathrooms"])
Loading...
houses.nsmallest(10, ["sqft_lot"])
Loading...
Selecting multiple columns
netflix[["title", "rating"]].tail(10)
Loading...
houses[["price", "bedrooms", "bathrooms"]].describe()
Loading...
houses
Loading...
cols = ["price", "zipcode", "sqft_lot"]
houses[cols]
Loading...
houses["bedrooms"]
0 3
1 3
2 2
3 4
4 3
..
21608 3
21609 4
21610 2
21611 3
21612 2
Name: bedrooms, Length: 21613, dtype: int64
value_counts
houses['bedrooms'].unique()
array([ 3, 2, 4, 5, 1, 6, 7, 0, 8, 9, 11, 10, 33])
houses["bedrooms"].value_counts() # how many rows have each unique value
bedrooms
3 9824
4 6882
2 2760
5 1601
6 272
1 199
7 38
0 13
8 13
9 6
10 3
11 1
33 1
Name: count, dtype: int64
titanic.sex.value_counts()
sex
male 843
female 466
Name: count, dtype: int64
netflix.director.value_counts().head(10)
director
Rajiv Chilaka 19
Raúl Campos, Jan Suter 18
Suhas Kadav 16
Marcus Raboy 16
Jay Karas 14
Cathy Garcia-Molina 13
Martin Scorsese 12
Youssef Chahine 12
Jay Chapman 12
Steven Spielberg 11
Name: count, dtype: int64
netflix.director.value_counts(ascending=True)
director
Lawrence Kasdan 1
Yasir Nawaz 1
S. Shankar 1
K.S. Ravikumar 1
Adam Salky 1
..
Jay Karas 14
Marcus Raboy 16
Suhas Kadav 16
Raúl Campos, Jan Suter 18
Rajiv Chilaka 19
Name: count, Length: 4528, dtype: int64
houses["floors"].value_counts()
floors
1.0 10680
2.0 8241
1.5 1910
3.0 613
2.5 161
3.5 8
Name: count, dtype: int64
houses[["bedrooms", "bathrooms"]]
Loading...
houses[["bedrooms", "bathrooms"]].value_counts()
bedrooms bathrooms
4 2.50 2502
3 2.50 2357
1.75 1870
1.00 1780
2 1.00 1558
...
10 2.00 1
3.00 1
5.25 1
11 3.00 1
33 1.75 1
Name: count, Length: 144, dtype: int64
type(houses[["bedrooms", "bathrooms"]].value_counts())
pandas.core.series.Series
houses.bedrooms
0 3
1 3
2 2
3 4
4 3
..
21608 3
21609 4
21610 2
21611 3
21612 2
Name: bedrooms, Length: 21613, dtype: int64
Plotting Intro!
houses.bedrooms.plot()
<Axes: >
houses.bedrooms.value_counts().plot()
<Axes: xlabel='bedrooms'>
houses.bedrooms.value_counts().plot(kind="bar")
<Axes: xlabel='bedrooms'>
houses.bedrooms.value_counts().plot(kind="pie")
<Axes: ylabel='count'>
titanic.survived.value_counts().plot(kind="pie")
<Axes: ylabel='count'>
titanic.sex.value_counts().plot(kind="pie")
<Axes: ylabel='count'>
houses.plot()
<Axes: >
houses[["bedrooms", "bathrooms"]].plot()
<Axes: >
df = houses[["bedrooms", "bathrooms"]]
df.plot(kind="scatter", x="bedrooms", y="bathrooms")
<Axes: xlabel='bedrooms', ylabel='bathrooms'>
netflix.rating.value_counts().head(10).plot(kind="bar")
<Axes: xlabel='rating'>