Data Visualization With Pandas and Matplotlib - Data Visualization With Python
Data Visualization With Pandas and Matplotlib - Data Visualization With Python
# Load the drinks data set, replacing the file's own header row with the
# shorter column names listed below.
drink_cols = ["country", "beer", "spirit", "wine", "liters", "continent"]
drinks = pd.read_csv(
    "../data/drinks.csv",
    header=0,  # row 0 holds the original header; `names` overrides it
    names=drink_cols,
    na_filter=False,  # no missing-value detection, so text such as "NA" stays a string
)
Data Exploration
# examine first few rows
drinks.head()
0 Afghanistan 0 0 0 0.0 AS
2 Algeria 25 0 14 0.7 AF
(193, 6)
# data structure
drinks.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 country 193 non-null object
1 beer 193 non-null int64
2 spirit 193 non-null int64
3 wine 193 non-null int64
4 liters 193 non-null float64
5 continent 193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
# numerical summary
drinks.describe()
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
0, 0, 1, 1, 1, 1, 2, 3, 5, 5, 5, 5,
5,
6, 6, 6, 6, 8, 8, 8, 9, 9, 9, 9, 12,
13,
15, 15, 16, 16, 17, 18, 19, 19, 20, 20, 21, 21,
21,
21, 22, 23, 25, 25, 25, 25, 26, 28, 31, 31, 31,
31,
32, 32, 34, 36, 36, 36, 37, 42, 42, 43, 44, 45,
47,
49, 51, 51, 52, 52, 52, 53, 56, 56, 57, 58, 60,
62,
62, 63, 64, 69, 71, 76, 76, 77, 77, 77, 78, 79,
82,
82, 85, 88, 89, 90, 92, 93, 93, 98, 99, 102, 105,
106,
109, 111, 115, 120, 122, 124, 127, 128, 130, 133, 140, 142,
143,
144, 147, 149, 149, 152, 157, 159, 162, 163, 167, 169, 171,
173,
185, 188, 192, 193, 193, 194, 194, 196, 197, 199, 203, 206,
213,
217, 219, 224, 224, 225, 230, 231, 233, 234, 236, 238, 240,
245,
245, 247, 249, 251, 261, 263, 263, 270, 279, 281, 283, 284,
285,
295, 297, 306, 313, 333, 343, 343, 346, 347, 361, 376])
Print to PDF
array([[ 0, 0],
[ 0, 74],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 1, 7],
[ 1, 1],
[ 1, 4],
[ 1, 1],
[ 2, 0],
[ 3, 1],
[ 5, 0],
[ 5, 0],
[ 5, 16],
[ 5, 1],
[ 5, 0],
[ 6, 1],
[ 6, 0],
[ 6, 1],
[ 6, 9],
[ 8, 0],
[ 8, 1],
[ 8, 1],
[ 9, 2],
[ 9, 0],
[ 9, 7],
[ 9, 0],
[ 12, 10],
[ 13, 0],
[ 15, 3],
[ 15, 1],
[ 16, 5],
[ 16, 0],
[ 17, 1],
[ 18, 0],
[ 19, 32],
[ 19, 2],
[ 20, 0],
[ 20, 31],
[ 21, 11],
[ 21, 11],
[ 21, 5],
[ 21, 1],
[ 22, 1],
[ 23, 0],
[ 25, 8],
[ 25, 14],
[ 25, 2],
[ 25, 7],
[ 26, 4],
[ 28, 21],
[ 31, 128],
[ 31, 6],
[ 31, 10],
[ 31, 1],
[ 32, 4],
[ 32, 1],
[ 34, 13],
[ 36, 19],
[ 36, 5],
[ 36, 1],
[ 37, 7],
[ 42, 2],
[ 42, 7],
[ 43, 0],
[ 44, 1],
[ 45, 0],
[ 47, 5],
[ 49, 8],
[ 51, 20],
[ 51, 7],
[ 52, 2],
[ 52, 149],
[ 52, 26],
[ 53, 2],
[ 56, 140],
[ 56, 1],
[ 57, 1],
[ 58, 2],
[ 60, 11],
[ 62, 18],
[ 62, 123],
[ 63, 9],
[ 64, 4],
[ 69, 2],
[ 71, 1],
[ 76, 8],
# add transparency
drinks.plot(kind='scatter', x="beer", y="wine", alpha=0.3);
AF 53
EU 45
AS 44
NA 23
OC 16
SA 12
Name: continent, dtype: int64
continent
Outliers
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2,
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5,
5,
6, 6, 6, 7, 9, 11, 11, 12, 13, 15, 15, 16,
16,
18, 18, 18, 18, 19, 21, 21, 22, 22, 25, 25, 27,
29,
31, 31, 34, 35, 35, 35, 35, 38, 39, 41, 41, 42,
42,
44, 46, 50, 51, 55, 56, 57, 60, 61, 63, 63, 65,
67,
68, 69, 69, 69, 71, 71, 72, 74, 75, 76, 76, 79,
81,
84, 87, 87, 88, 97, 97, 98, 98, 100, 100, 100, 100,
101,
104, 104, 112, 114, 114, 114, 117, 117, 118, 118, 122, 122,
124,
126, 128, 131, 132, 133, 133, 135, 137, 138, 145, 147, 151,
152,
154, 156, 157, 158, 160, 170, 173, 173, 176, 178, 179, 186,
189,
192, 194, 200, 202, 205, 215, 215, 216, 221, 226, 237, 244,
246,
252, 254, 258, 286, 293, 302, 315, 326, 326, 373, 438])
count 193.000000
mean 80.994819
std 88.284312
min 0.000000
25% 4.000000
50% 56.000000
75% 128.000000
max 438.000000
Name: spirit, dtype: float64
Colors Shape
City State Time Year
Reported Reported
1930-06-
0 Ithaca NaN TRIANGLE NY 01 1930
22:00:00
1930-06-
1 Willingboro NaN OTHER NJ 30 1930
20:00:00
1931-02-
2 Holyoke NaN OVAL CO 15 1931
14:00:00
1931-06-
3 Abilene NaN DISK KS 01 1931
13:00:00
1933-04-
New York
4 NaN LIGHT NY 18 1933
Worlds Fair
19:00:00
(80543, 6)
# data structure
ufo.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80543 entries, 0 to 80542
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 City 80496 non-null object
1 Colors Reported 17034 non-null object
2 Shape Reported 72141 non-null object
3 State 80543 non-null object
4 Time 80543 non-null datetime64[ns]
5 Year 80543 non-null int64
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 3.7+ MB
# numerical summary
ufo.describe()
Year
count 80543.000000
mean 2004.178737
std 10.602487
min 1930.000000
25% 2001.000000
50% 2007.000000
75% 2011.000000
max 2014.000000
# count the number of ufo reports each year (and sort by year)
ufo.Year.value_counts().sort_index()
1930 2
1931 2
1933 1
1934 1
1935 1
...
2010 4154
2011 5089
2012 7263
2013 7003
2014 5382
Name: Year, Length: 82, dtype: int64
Grouped Box Plots: show one box plot for each group
# reminder: box plot of beer servings
drinks.beer.plot(kind='box');
Assorted Functionality
# Saving a plot to a file: draw the histogram, label its axes, then write the
# current figure to disk.
drinks.beer.plot(kind='hist', bins=20, title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")
# The output format follows the file extension (.png, .tiff, .pdf, .jpeg).
# The file name is left unchanged so the file written stays the same.
plt.savefig("beer_survings.png")
['Solarize_Light2',
'_classic_test_patch',
'bmh',
'classic',
'dark_background',
'fast',
'fivethirtyeight',
'ggplot',
'grayscale',
'seaborn',
'seaborn-bright',
'seaborn-colorblind',
'seaborn-dark',
'seaborn-dark-palette',
'seaborn-darkgrid',
'seaborn-deep',
'seaborn-muted',
'seaborn-notebook',
'seaborn-paper',
'seaborn-pastel',
'seaborn-poster',
'seaborn-talk',
'seaborn-ticks',
'seaborn-white',
'seaborn-whitegrid',
'tableau-colorblind10']
By Jubayer Hossain
© Copyright 2020.