Skip to article frontmatter | Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Columns and Series Basics

# Read in some datasets to work with!
import pandas as pd
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)
titanic.head()
Loading...

Column selection

titanic.name
0 Allen, Miss. Elisabeth Walton 1 Allison, Master. Hudson Trevor 2 Allison, Miss. Helen Loraine 3 Allison, Mr. Hudson Joshua Creighton 4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) ... 1304 Zabour, Miss. Hileni 1305 Zabour, Miss. Thamine 1306 Zakarian, Mr. Mapriededer 1307 Zakarian, Mr. Ortin 1308 Zimmerman, Mr. Leo Name: name, Length: 1309, dtype: object
titanic['name']
0 Allen, Miss. Elisabeth Walton 1 Allison, Master. Hudson Trevor 2 Allison, Miss. Helen Loraine 3 Allison, Mr. Hudson Joshua Creighton 4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) ... 1304 Zabour, Miss. Hileni 1305 Zabour, Miss. Thamine 1306 Zakarian, Mr. Mapriededer 1307 Zakarian, Mr. Ortin 1308 Zimmerman, Mr. Leo Name: name, Length: 1309, dtype: object
titanic.age
0 29 1 0.9167 2 2 3 30 4 25 ... 1304 14.5 1305 ? 1306 26.5 1307 27 1308 29 Name: age, Length: 1309, dtype: object
titanic["age"]
0 29 1 0.9167 2 2 3 30 4 25 ... 1304 14.5 1305 ? 1306 26.5 1307 27 1308 29 Name: age, Length: 1309, dtype: object
titanic["home.dest"]
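# titanic.home.dest will not work: Python reads it as titanic.home followed by .dest, so a column name containing a dot needs bracket notation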
0 St Louis, MO 1 Montreal, PQ / Chesterville, ON 2 Montreal, PQ / Chesterville, ON 3 Montreal, PQ / Chesterville, ON 4 Montreal, PQ / Chesterville, ON ... 1304 ? 1305 ? 1306 ? 1307 ? 1308 ? Name: home.dest, Length: 1309, dtype: object
mystery_col = "price"
houses[mystery_col]
0 221900.0 1 538000.0 2 180000.0 3 604000.0 4 510000.0 ... 21608 360000.0 21609 400000.0 21610 402101.0 21611 400000.0 21612 325000.0 Name: price, Length: 21613, dtype: float64
titanic
Loading...
names = titanic.name
type(names)
pandas.core.series.Series
names
0 Allen, Miss. Elisabeth Walton 1 Allison, Master. Hudson Trevor 2 Allison, Miss. Helen Loraine 3 Allison, Mr. Hudson Joshua Creighton 4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) ... 1304 Zabour, Miss. Hileni 1305 Zabour, Miss. Thamine 1306 Zakarian, Mr. Mapriededer 1307 Zakarian, Mr. Ortin 1308 Zimmerman, Mr. Leo Name: name, Length: 1309, dtype: object
titanic
Loading...
houses.sum(numeric_only=True)
id 9.899406e+13 price 1.167293e+10 bedrooms 7.285400e+04 bathrooms 4.570625e+04 sqft_living 4.495287e+07 sqft_lot 3.265069e+08 floors 3.229650e+04 waterfront 1.630000e+02 view 5.064000e+03 condition 7.368800e+04 grade 1.654880e+05 sqft_above 3.865249e+07 sqft_basement 6.300385e+06 yr_built 4.259933e+07 yr_renovated 1.824186e+06 zipcode 2.119759e+09 lat 1.027915e+06 long -2.641409e+06 sqft_living15 4.293536e+07 sqft_lot15 2.759646e+08 dtype: float64
houses.price.sum()
np.float64(11672925008.0)
houses.price.max()
np.float64(7700000.0)
names.shape
(1309,)
titanic.shape
(1309, 14)
names.values
array(['Allen, Miss. Elisabeth Walton', 'Allison, Master. Hudson Trevor', 'Allison, Miss. Helen Loraine', ..., 'Zakarian, Mr. Mapriededer', 'Zakarian, Mr. Ortin', 'Zimmerman, Mr. Leo'], shape=(1309,), dtype=object)
names.index
RangeIndex(start=0, stop=1309, step=1)
names
0 Allen, Miss. Elisabeth Walton 1 Allison, Master. Hudson Trevor 2 Allison, Miss. Helen Loraine 3 Allison, Mr. Hudson Joshua Creighton 4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) ... 1304 Zabour, Miss. Hileni 1305 Zabour, Miss. Thamine 1306 Zakarian, Mr. Mapriededer 1307 Zakarian, Mr. Ortin 1308 Zimmerman, Mr. Leo Name: name, Length: 1309, dtype: object
mins = houses.min(numeric_only=True)
mins.index
Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
mins
id 1.000102e+06 price 7.500000e+04 bedrooms 0.000000e+00 bathrooms 0.000000e+00 sqft_living 2.900000e+02 sqft_lot 5.200000e+02 floors 1.000000e+00 waterfront 0.000000e+00 view 0.000000e+00 condition 1.000000e+00 grade 1.000000e+00 sqft_above 2.900000e+02 sqft_basement 0.000000e+00 yr_built 1.900000e+03 yr_renovated 0.000000e+00 zipcode 9.800100e+04 lat 4.715590e+01 long -1.225190e+02 sqft_living15 3.990000e+02 sqft_lot15 6.510000e+02 dtype: float64

Important Series Methods

  • head()

  • tail()

  • describe()

  • unique()

  • nunique()

  • nlargest()

  • nsmallest()

  • value_counts()

  • plot()

titanic.age.head(10)
0 29 1 0.9167 2 2 3 30 4 25 5 48 6 63 7 39 8 53 9 71 Name: age, dtype: object
netflix.title.tail(50)
8757 World Trade Center 8758 World's Busiest Cities 8759 World's Weirdest Homes 8760 Would You Rather 8761 Wrong No. 8762 Wrong Side Raju 8763 WWII: Report from the Aleutians 8764 Wyatt Earp 8765 XX 8766 XXx 8767 XXX: State of the Union 8768 Y Tu Mamá También 8769 Y.M.I.: Yeh Mera India 8770 Yaadein 8771 Yaara O Dildaara 8772 Yamla Pagla Deewana 2 8773 Yanda Kartavya Aahe 8774 يوم الدين 8775 Yeh Meri Family 8776 Yellowbird 8777 Yes or No 8778 Yes or No 2 8779 Yes or No 2.5 8780 Yo-Kai Watch 8781 Yo-Kai Watch: The Movie 8782 Yoga Hosers 8783 Yoko 8784 Yoko and His Friends 8785 YOM 8786 You Can Tutu 8787 You Can’t Fight Christmas 8788 You Carry Me 8789 You Changed My Life 8790 You Don't Mess with the Zohan 8791 Young Adult 8792 Young Tiger 8793 Yours, Mine and Ours 8794 اشتباك 8795 Yu-Gi-Oh! Arc-V 8796 Yunus Emre 8797 Zak Storm 8798 Zed Plus 8799 Zenda 8800 Zindagi Gulzar Hai 8801 Zinzana 8802 Zodiac 8803 Zombie Dumb 8804 Zombieland 8805 Zoom 8806 Zubaan Name: title, dtype: object
houses.describe()
Loading...
houses["price"].describe()
count 2.161300e+04 mean 5.400881e+05 std 3.671272e+05 min 7.500000e+04 25% 3.219500e+05 50% 4.500000e+05 75% 6.450000e+05 max 7.700000e+06 Name: price, dtype: float64
netflix["rating"].describe()
count 8803 unique 17 top TV-MA freq 3207 Name: rating, dtype: object
netflix.dtypes
show_id object type object title object director object cast object country object date_added object release_year int64 rating object duration object listed_in object description object dtype: object
netflix.release_year.describe()
count 8807.000000 mean 2014.180198 std 8.819312 min 1925.000000 25% 2013.000000 50% 2017.000000 75% 2019.000000 max 2021.000000 Name: release_year, dtype: float64
titanic.name
0 Allen, Miss. Elisabeth Walton 1 Allison, Master. Hudson Trevor 2 Allison, Miss. Helen Loraine 3 Allison, Mr. Hudson Joshua Creighton 4 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) ... 1304 Zabour, Miss. Hileni 1305 Zabour, Miss. Thamine 1306 Zakarian, Mr. Mapriededer 1307 Zakarian, Mr. Ortin 1308 Zimmerman, Mr. Leo Name: name, Length: 1309, dtype: object
houses["bedrooms"].unique()
array([ 3, 2, 4, 5, 1, 6, 7, 0, 8, 9, 11, 10, 33])
netflix.rating.unique()
array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R', 'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan, 'TV-Y7-FV', 'UR'], dtype=object)
netflix.rating
0 PG-13 1 TV-MA 2 TV-MA 3 TV-MA 4 TV-MA ... 8802 R 8803 TV-Y7 8804 R 8805 PG 8806 TV-14 Name: rating, Length: 8807, dtype: object
# number of unique `rating` values (NaN is not counted by default)
netflix.rating.nunique() 
17
houses.zipcode.nunique()
70
netflix.info()
<class 'pandas.core.frame.DataFrame'>
Index: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 894.5+ KB
netflix.head()
Loading...
netflix.rating.nunique(dropna=False) # Includes NaN
18
netflix.rating.nunique()
17
# 9 largest
houses.price.nlargest(9) 
7252 7700000.0 3914 7062500.0 9254 6885000.0 4411 5570000.0 1448 5350000.0 1315 5300000.0 1164 5110800.0 8092 4668000.0 2626 4500000.0 Name: price, dtype: float64
houses.price.nsmallest(10)
1149 75000.0 15293 78000.0 465 80000.0 16198 81000.0 8274 82000.0 2141 82500.0 18468 83000.0 3767 84000.0 10253 85000.0 16714 85000.0 Name: price, dtype: float64
titanic.pclass.nlargest(5, keep="all")
600 3 601 3 602 3 603 3 604 3 .. 1304 3 1305 3 1306 3 1307 3 1308 3 Name: pclass, Length: 709, dtype: int64
houses.nlargest(10,["price"]) # called on the DataFrame: returns the whole rows with the 10 largest prices
Loading...
houses.nlargest(10, ["bedrooms"])
Loading...
houses.nlargest(10, ["bedrooms", "bathrooms"])
Loading...
houses.nsmallest(10, ["sqft_lot"])
Loading...

Selecting multiple columns

netflix[["title", "rating"]].tail(10)
Loading...
houses[["price", "bedrooms", "bathrooms"]].describe()
Loading...
houses
Loading...
cols = ["price", "zipcode", "sqft_lot"]
houses[cols]
Loading...
houses["bedrooms"]
0 3 1 3 2 2 3 4 4 3 .. 21608 3 21609 4 21610 2 21611 3 21612 2 Name: bedrooms, Length: 21613, dtype: int64

value_counts

houses['bedrooms'].unique()
array([ 3, 2, 4, 5, 1, 6, 7, 0, 8, 9, 11, 10, 33])
houses["bedrooms"].value_counts() # how many rows have each unique value, most frequent first
bedrooms 3 9824 4 6882 2 2760 5 1601 6 272 1 199 7 38 0 13 8 13 9 6 10 3 11 1 33 1 Name: count, dtype: int64
titanic.sex.value_counts()
sex male 843 female 466 Name: count, dtype: int64
netflix.director.value_counts().head(10)
director Rajiv Chilaka 19 Raúl Campos, Jan Suter 18 Suhas Kadav 16 Marcus Raboy 16 Jay Karas 14 Cathy Garcia-Molina 13 Martin Scorsese 12 Youssef Chahine 12 Jay Chapman 12 Steven Spielberg 11 Name: count, dtype: int64
netflix.director.value_counts(ascending=True)
director Lawrence Kasdan 1 Yasir Nawaz 1 S. Shankar 1 K.S. Ravikumar 1 Adam Salky 1 .. Jay Karas 14 Marcus Raboy 16 Suhas Kadav 16 Raúl Campos, Jan Suter 18 Rajiv Chilaka 19 Name: count, Length: 4528, dtype: int64
houses["floors"].value_counts()
floors 1.0 10680 2.0 8241 1.5 1910 3.0 613 2.5 161 3.5 8 Name: count, dtype: int64
houses[["bedrooms", "bathrooms"]]
Loading...
houses[["bedrooms", "bathrooms"]].value_counts()
bedrooms bathrooms 4 2.50 2502 3 2.50 2357 1.75 1870 1.00 1780 2 1.00 1558 ... 10 2.00 1 3.00 1 5.25 1 11 3.00 1 33 1.75 1 Name: count, Length: 144, dtype: int64
type(houses[["bedrooms", "bathrooms"]].value_counts())
pandas.core.series.Series
houses.bedrooms
0 3 1 3 2 2 3 4 4 3 .. 21608 3 21609 4 21610 2 21611 3 21612 2 Name: bedrooms, Length: 21613, dtype: int64

Plotting Intro!

houses.bedrooms.plot()
<Axes: >
<Figure size 640x480 with 1 Axes>
houses.bedrooms.value_counts().plot()
<Axes: xlabel='bedrooms'>
<Figure size 640x480 with 1 Axes>
houses.bedrooms.value_counts().plot(kind="bar")
<Axes: xlabel='bedrooms'>
<Figure size 640x480 with 1 Axes>
houses.bedrooms.value_counts().plot(kind="pie")
<Axes: ylabel='count'>
<Figure size 640x480 with 1 Axes>
titanic.survived.value_counts().plot(kind="pie")
<Axes: ylabel='count'>
<Figure size 640x480 with 1 Axes>
titanic.sex.value_counts().plot(kind="pie")
<Axes: ylabel='count'>
<Figure size 640x480 with 1 Axes>
houses.plot()
<Axes: >
<Figure size 640x480 with 1 Axes>
houses[["bedrooms", "bathrooms"]].plot()
<Axes: >
<Figure size 640x480 with 1 Axes>
df = houses[["bedrooms", "bathrooms"]]
df.plot(kind="scatter", x="bedrooms", y="bathrooms")
<Axes: xlabel='bedrooms', ylabel='bathrooms'>
<Figure size 640x480 with 1 Axes>
netflix.rating.value_counts().head(10).plot(kind="bar")
<Axes: xlabel='rating'>
<Figure size 640x480 with 1 Axes>