Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Pandas Groupby & Aggregates

import pandas as pd
carstocks = pd.read_csv("data/car_stocks.csv")
carstocks
Loading...
carstocks["Close"].mean()
79.83897420512822
carstocks[carstocks["Symbol"] == "RIVN"]["Close"].mean()
127.52307653846154
carstocks[carstocks["Symbol"] == "GM"]["Close"].mean()
62.16461546153845
carstocks[carstocks["Symbol"] == "LCID"]["Close"].mean()
49.82923061538462

Groupby Basics

carstocks.groupby("Symbol")["Close"].mean()
Symbol GM 62.164615 LCID 49.829231 RIVN 127.523077 Name: Close, dtype: float64
titanic = pd.read_csv("data/titanic.csv")
titanic['age'] = titanic["age"].replace(['?'], [None]).astype('float')
df = titanic[["pclass", "survived", "sex", "age"]]
gbo = df.groupby(by="sex")
gbo
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd91832b400>
gbo.ngroups
2
gbo.groups
{'female': [0, 2, 4, 6, 8, 11, 12, 13, 17, 18, 21, 23, 24, 27, 28, 32, 33, 35, 36, 41, 42, 43, 44, 48, 50, 55, 57, 59, 61, 63, 65, 66, 67, 69, 72, 73, 76, 78, 79, 82, 83, 85, 88, 90, 92, 95, 97, 98, 99, 102, 103, 104, 105, 107, 108, 111, 112, 113, 116, 117, 121, 122, 124, 127, 129, 130, 131, 134, 137, 139, 141, 144, 146, 149, 153, 155, 159, 160, 161, 163, 167, 168, 169, 176, 178, 180, 181, 182, 186, 187, 188, 190, 192, 193, 195, 198, 199, 204, 207, 208, ...], 'male': [1, 3, 5, 7, 9, 10, 14, 15, 16, 19, 20, 22, 25, 26, 29, 30, 31, 34, 37, 38, 39, 40, 45, 46, 47, 49, 51, 52, 53, 54, 56, 58, 60, 62, 64, 68, 70, 71, 74, 75, 77, 80, 81, 84, 86, 87, 89, 91, 93, 94, 96, 100, 101, 106, 109, 110, 114, 115, 118, 119, 120, 123, 125, 126, 128, 132, 133, 135, 136, 138, 140, 142, 143, 145, 147, 148, 150, 151, 152, 154, 156, 157, 158, 162, 164, 165, 166, 170, 171, 172, 173, 174, 175, 177, 179, 183, 184, 185, 189, 191, ...]}
df
Loading...
df.groupby("age").groups
{0.1667: [763], 0.3333: [747], 0.4167: [1240], 0.6667: [427], 0.75: [657, 658, 1111], 0.8333: [359, 548, 611], 0.9167: [1, 590], 1.0: [339, 478, 492, 762, 826, 895, 937, 1048, 1101, 1187], 2.0: [2, 514, 540, 587, 624, 866, 1096, 1103, 1144, 1156, 1209, 1230], 3.0: [479, 515, 549, 641, 734, 1098, 1112], 4.0: [94, 340, 588, 622, 894, 916, 934, 1142, 1189, 1206], 5.0: [591, 639, 643, 659, 794], 6.0: [273, 430, 623, 678, 1025, 1097], 7.0: [434, 1102, 1143, 1256], 8.0: [377, 385, 398, 541, 1099, 1145], 9.0: [627, 640, 679, 733, 807, 820, 825, 1082, 1208, 1257], 10.0: [828, 1141, 1207, 1265], 11.0: [54, 628, 827, 855], 11.5: [1263], 12.0: [341, 582, 1056], 13.0: [249, 501, 601, 642, 653], 14.0: [55, 513, 569, 830, 1057, 1105, 1236, 1279], 14.5: [1171, 1304], 15.0: [193, 350, 792, 1007, 1047, 1300], 16.0: [159, 187, 195, 416, 510, 602, 604, 709, 761, 787, 810, 818, 829, 1093, 1104, 1161, 1232, 1244, 1275], 17.0: [53, 92, 229, 295, 390, 458, 482, 625, 650, 700, 701, 738, 740, 755, 772, 791, 841, 885, 910, 1133], 18.0: [11, 198, 228, 250, 270, 289, 326, 331, 386, 394, 395, 405, 408, 445, 558, 607, 612, 619, 636, 661, 665, 676, 695, 698, 717, 719, 786, 799, 809, 859, 938, 1045, 1060, 1130, 1157, 1205, 1260, 1273, 1288], 18.5: [568, 692, 919], 19.0: [27, 114, 137, 197, 216, 337, 344, 364, 503, 518, 530, 534, 552, 621, 669, 694, 731, 737, 744, 771, 777, 839, 898, 1011, 1050, 1108, 1127, 1217, 1226], 20.0: [353, 446, 520, 559, 606, 615, 633, 654, 664, 673, 687, 718, 836, 846, 883, 907, 970, 1049, 1089, 1091, 1092, 1191, 1278], 20.5: [977], 21.0: [190, 251, 307, 315, 317, 383, 404, 419, 420, 444, 453, 533, 553, 648, 675, 685, 693, 696, 702, 703, 704, 713, 754, 806, 850, 881, 908, 911, 913, 1017, 1020, 1062, 1065, 1107, 1117, 1140, 1182, 1204, 1224, 1289, 1295], 22.0: [36, 73, 122, 130, 220, 227, 236, 361, 380, 463, 468, 481, 523, 668, 671, 686, 689, 690, 725, 743, 753, 769, 785, 817, 862, 867, 890, 915, 932, 947, 952, 953, 986, 1046, 1067, 1079, 1119, 1147, 1201, 1227, 1277, 1280, 1281], 22.5: [741], 23.0: [102, 113, 140, 214, 225, 272, 332, 345, 403, 447, 465, 525, 547, 571, 579, 645, 649, 652, 780, 784, 861, 904, 980, 1075, 1090, 1223], 23.5: [847], 24.0: [12, 16, 111, 132, 153, 199, 255, 268, 271, 349, 376, 392, 421, 422, 428, 437, 438, 442, 460, 462, 467, 486, 494, 550, 599, 616, 637, 660, 708, 712, 720, 722, 752, 779, 783, 840, 845, 965, 985, 1010, 1019, 1040, 1125, 1134, 1181, 1188, 1237], 24.5: [1192], 25.0: [4, 25, 26, 143, 144, 327, 354, 356, 370, 393, 480, 555, 557, 565, 567, 605, 617, 635, 751, 766, 814, 853, 878, 941, 966, 1024, 1118, 1120, 1129, 1165, 1190, 1234, 1238, 1254], 26.0: [13, 22, 72, 346, 360, 417, 475, 517, 554, 598, 609, 613, 614, 631, 634, 663, 670, 677, 716, 764, 803, 849, 860, 889, 933, 936, 949, 975, 1061, 1113], 26.5: [1306], 27.0: [64, 71, 87, 90, 97, 151, 313, 348, 401, 507, 539, 556, 573, 575, 585, 630, 667, 730, 750, 857, 870, 877, 899, 906, 978, 1026, 1229, 1296, 1299, 1307], 28.0: [29, 52, 112, 203, 265, 324, 334, 338, 373, 375, 388, 431, 519, 527, 546, 572, 707, 711, 749, 838, 844, 863, 864, 869, 972, 1021, 1059, 1083, 1087, 1126, 1270, 1271], 28.5: [222, 1066, 1294], 29.0: [0, 24, 189, 226, 369, 372, 374, 391, 407, 452, 521, 526, 574, 586, 589, 688, 715, 746, 880, 893, 935, 950, 951, 990, 1058, 1100, 1196, 1231, 1258, 1308], 30.0: [3, 32, 67, 110, 117, 182, 191, 194, 209, 230, 258, 323, 325, 381, 402, 424, 426, 433, 476, 496, 499, 538, 545, 560, 562, 578, 608, 651, 697, 726, 732, 745, 760, 778, 875, 912, 969, 974, 1218, 1267], 30.5: [992, 1251], 31.0: [89, 91, 213, 239, 298, 309, 319, 378, 379, 474, 493, 577, 580, 596, 723, 724, 823, 892, 1086, 1094, 1228, 1274, 1276], 32.0: [18, 278, 336, 389, 443, 464, 498, 536, 620, 655, 674, 684, 714, 776, 834, 905, 909, 959, 981, 1088, 1110, 1131, 1220, 1248], 32.5: [173, 512, 584, 1285], 33.0: [51, 65, 88, 207, 242, 245, 248, 457, 542, 593, 656, 765, 781, 821, 891, 897, 914, 996, 1051, 1222, 1269], 34.0: [259, 328, 343, 396, 400, 414, 415, 423, 466, 484, 537, 544, 748, 888, 1031, 1239], 34.5: [924, 960], 35.0: [28, 127, 129, 149, 163, 164, 167, 183, 257, 261, 302, 362, 412, 470, 563, 603, 618, 638, 691, 729, 995, 1008, 1148], 36.0: [19, 49, 56, 57, 60, 66, 82, 105, 109, 202, 244, 322, 329, 333, 342, 355, 409, 448, 485, 543, 592, 721, 735, 759, 770, 939, 963, 968, 1259, 1266, 1298], 36.5: [516, 758], 37.0: [20, 77, 126, 208, 212, 368, 710, 837, 943], 38.0: [85, 103, 138, 165, 168, 234, 411, 413, 626, 646, 699, 824, 1139, 1286], 38.5: [1169], 39.0: [7, 76, 84, 101, 180, 218, 263, 291, 296, 504, 509, 522, 629, 632, 790, 917, 964, 1064, 1146, 1183], 40.0: [31, 150, 260, 275, 299, 352, 406, 497, 564, 576, 583, 610, 644, 662, 683, 831, 1203, 1210], 40.5: [796, 797, 1264], 41.0: [38, 44, 175, 502, 532, 566, 822, 848, 1063, 1106, 1158], 42.0: [23, 34, 47, 156, 162, 177, 185, 347, 357, 358, 399, 454, 459, 489, 600, 773, 873, 1084], 43.0: [120, 238, 281, 435, 535, 728, 774, 832, 868], 44.0: [41, 160, 206, 365, 429, 451, 739, 925, 1233, 1252], 45.0: [30, 35, 45, 68, 131, 141, 148, 178, 240, 274, 311, 371, 436, 471, 647, 666, 742, 788, 851, 967, 1211], 45.5: [224, 1301], 46.0: [62, 142, 171, 200, 243, 500], 47.0: [10, 21, 63, 75, 128, 211, 232, 301, 418, 461, 793, 1153, 1272, 1290], 48.0: [5, 39, 98, 99, 124, 145, 253, 288, 292, 387, 440, 505, 811, 884], 49.0: [58, 100, 133, 146, 170, 181, 294, 439, 896], 50.0: [17, 96, 119, 169, 172, 262, 312, 314, 450, 488, 529, 551, 570, 1152, 1160], 51.0: [86, 161, 188, 316, 335, 833, 979, 1287], 52.0: [155, 231, 282, 290, 367, 425], 53.0: [8, 48, 93, 136], 54.0: [95, 104, 201, 247, 280, 306, 366, 397, 449, 508], 55.0: [79, 154, 158, 186, 210, 246, 308, 441], 55.5: [1015], 56.0: [233, 264, 266, 267], 57.0: [276, 310, 330, 472, 490], 58.0: [33, 50, 139, 174, 192, 215], 59.0: [42, 561, 736], 60.0: [43, 116, 123, 304, 305, 351, 456], 60.5: [1225], 61.0: [252, 287, 300, 487, 1068], 62.0: [279, 284, 321, 432, 511], 63.0: [6, 286, 455, 1261], 64.0: [78, 83, 115, 217, 303], 65.0: [205, 221, 782], 66.0: [594], 67.0: [285], 70.0: [81, 506], 70.5: [727], 71.0: [9, 135], 74.0: [1235], 76.0: [61], 80.0: [14]}
df.groupby("age").first()
Loading...
gbo.get_group("male")
Loading...
for name, group in gbo:
    print(name)
    print("----------")
    print(group)
female
----------
      pclass  survived     sex   age
0          1         1  female  29.0
2          1         0  female   2.0
4          1         0  female  25.0
6          1         1  female  63.0
8          1         1  female  53.0
...      ...       ...     ...   ...
1286       3         1  female  38.0
1290       3         1  female  47.0
1300       3         1  female  15.0
1304       3         0  female  14.5
1305       3         0  female   NaN

[466 rows x 4 columns]
male
----------
      pclass  survived   sex      age
1          1         1  male   0.9167
3          1         0  male  30.0000
5          1         1  male  48.0000
7          1         0  male  39.0000
9          1         0  male  71.0000
...      ...       ...   ...      ...
1302       3         0  male      NaN
1303       3         0  male      NaN
1306       3         0  male  26.5000
1307       3         0  male  27.0000
1308       3         0  male  29.0000

[843 rows x 4 columns]
gbo["age"].mean()
sex female 28.687071 male 30.585233 Name: age, dtype: float64
gbo["age"].max()
sex female 76.0 male 80.0 Name: age, dtype: float64
gbo["age"].mean().plot(kind="bar", title="Average Age By Sex")
<Figure size 432x288 with 1 Axes>
titanic.groupby("pclass")["age"].mean()
pclass 1 39.159918 2 29.506705 3 24.816367 Name: age, dtype: float64
titanic.groupby("sex")["pclass"].mean()
sex female 2.154506 male 2.372479 Name: pclass, dtype: float64
titanic.groupby("sex").mean()
Loading...
titanic.groupby("sex").median()
Loading...
carstocks.groupby("Symbol")["High"].max()
Symbol GM 65.180000 LCID 57.750000 RIVN 179.470001 Name: High, dtype: float64

Agg

titanic.groupby("sex")["age"].agg("min")
sex female 0.1667 male 0.3333 Name: age, dtype: float64
titanic.groupby("sex")["age"].agg(["min", "max", "mean", "median"])
Loading...
titanic.groupby("sex").agg(["min", "max"])
Loading...
titanic.groupby("sex").agg({"age": ["min", "max"], "pclass": "mean"})
Loading...
carstocks.groupby("Symbol").agg({"Open": "mean", "Close": "mean", "Volume": ["mean", "sum"]})
Loading...
def range(s):
    return s.max() - s.min()
    
titanic.groupby("pclass")["age"].agg(["min", "max", range])
Loading...
titanic["age"].size - titanic["age"].count()
263
def count_nulls(s):
    return s.size - s.count()

titanic.groupby("pclass")["age"].agg(count_nulls)
pclass 1 39.0 2 16.0 3 208.0 Name: age, dtype: float64
carstocks.groupby("Symbol").agg({"Open": ["min", "max"],"Close": ["min", "max"], })
Loading...
carstocks.groupby("Symbol").agg(
    min_open=("Open", "min"),
    max_open=("Open", "max"),
    min_close=("Close", "min"),
    max_close=("Close", "max")
)
Loading...
carstocks.groupby("Symbol", as_index=False).agg({
    "Open": ["min", "max"],
    "Close": ["min", "max"], 
})
Loading...
carstocks.groupby("Symbol").agg({
    "Open": ["min", "max"],
    "Close": ["min", "max"], 
})
Loading...

Grouping By Multiple Columns & Multi Indexes

titanic.groupby("sex")["age"].mean()
sex female 28.687071 male 30.585233 Name: age, dtype: float64
titanic.groupby(["sex", "pclass", "survived"])["age"].mean()
sex pclass survived female 1 0 35.200000 1 37.109375 2 0 34.090909 1 26.711051 3 0 23.418750 1 20.814815 male 1 0 43.658163 1 36.168240 2 0 33.092593 1 17.449274 3 0 26.679598 1 22.436441 Name: age, dtype: float64
titanic.groupby(["sex", "pclass"]).mean()
Loading...
titanic.head()
Loading...
df = titanic.groupby(["sex", "pclass"]).mean()
df
Loading...
df.loc[[("male", 3)]]
Loading...
titanic.loc[[1,2]]
Loading...
df.loc[[("female", 3), ("male", 1)]]
Loading...