BTVN1 - Colaboratory
BTVN1 - Colaboratory
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import statistics
from sklearn.datasets import load_boston
# Load the Boston housing dataset through scikit-learn.
# NOTE(review): the output recorded right after this call is a FutureWarning
# about this dataset that recommends fetching the data from its original
# source instead -- confirm load_boston exists in the installed scikit-learn.
boston = load_boston();
The Boston housing prices dataset has an ethical problem. You can refer to
the documentation of this function for further details.
In this special case, you can fetch the dataset from the original
source::
import pandas as pd
import numpy as np
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
warnings.warn(msg, category=FutureWarning)
# Feature matrix and target vector of the loaded dataset.
x = boston.data
y = boston.target

# Label / statistic table.  Each statistic is evaluated only when its row is
# printed, so the output order and content match one print call per line.
# NOTE(review): np.cov on a single 1-D array yields its sample variance
# (ddof=1), although the last label reads "he so tuong quan" -- confirm intent.
_summary = (
    ("min y: ", np.min),
    ("max y: ", np.max),
    ("trung binh cua y: ", np.mean),
    ("trung vi cua y: ", np.median),
    ("mode cua y: ", statistics.mode),
    ("phuong sai cua y: ", np.var),
    ("do lech chuan cua y: ", np.std),
    ("he so tuong quan cua y: ", np.cov),
)
for _label, _stat in _summary:
    print(_label, _stat(y))
min y: 5.0
max y: 50.0
trung binh cua y: 22.532806324110673
trung vi cua y: 21.2
mode cua y: 50.0
phuong sai cua y: 84.41955615616554
do lech chuan cua y: 9.188011545278203
he so tuong quan cua y: 84.58672359409846
# min
# Manual scan for the smallest target value.  The running minimum starts at
# +infinity so the scan is correct for any value range (a fixed 1e9 sentinel
# is wrong once every value exceeds it), and it is stored in `min_y` so the
# builtin `min` is not overwritten by a float.
min_y = float("inf")
for i in y:
    if i < min_y:
        min_y = i
print(min_y)
5.0
# max
# Manual scan for the largest target value.  The running maximum starts at
# -infinity so the scan is correct for any value range (a fixed -1e9 sentinel
# is wrong once every value is below it), and it is stored in `max_y` so the
# builtin `max` is not overwritten by a float.
max_y = float("-inf")
for i in y:
    if i > max_y:
        max_y = i
print(max_y)
50.0
# mean
# Arithmetic mean: the total of all target values divided by their count.
_total = sum(y)
_count = len(y)
print("trung binh cua y: ", _total / _count)
# median
# Work on a sorted copy.  `y` is the dataset's own target array, so sorting it
# in place would reorder the targets and break their row-wise pairing with the
# feature columns, which the later correlation between y and CRIM relies on.
sorted_y = sorted(y)
n = len(sorted_y)
mid = n // 2
if n % 2 == 0:
    # Even count: average the two middle values.
    median = (sorted_y[mid - 1] + sorted_y[mid]) / 2
else:
    # Odd count: the single middle value.
    median = sorted_y[mid]
print("trung vi cua y: ", median)
from collections import Counter

# mode
n = len(y)
# Occurrence count of every distinct target value.
data = Counter(y)
# Highest occurrence count, computed once instead of once per distinct value.
top_count = np.max(list(data.values()))
mode = [value for value, count in data.items() if count == top_count]
if len(mode) == n:
    # Every value occurs exactly once, so no value stands out as a mode.
    get_mode = "no mode found"
else:
    get_mode = "mode is / are: " + ', '.join(map(str, mode))
print(get_mode)
# variance
import math

# Population variance: the mean squared deviation from the mean.  The mean is
# computed once up front rather than once per element, and the variance is
# reused for the standard deviation instead of being recomputed.
mean_y = np.mean(y)
variance_y = sum((mean_y - i) ** 2 for i in y) / len(y)
print("phuong sai cua y: ", variance_y)
# standard deviation
print("do lech chuan cua y: ", math.sqrt(variance_y))
# Feature table with one named column per dataset feature.
data = pd.DataFrame(boston.data, columns=boston.feature_names)
# `head` has to be called; the bare attribute only evaluates to the bound
# method object instead of returning the first rows.
data.head()
<bound method NDFrame.head of CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0
.. ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0
PTRATIO B LSTAT
0 15.3 396.90 4.98
https://colab.research.google.com/drive/1TunkxkXexb5FlH_g8lO4LhgPqtTAmvmV#scrollTo=xV6gjZIYqnrN&printMode=true 2/4
07/02/2023, 23:27 BTVN1 - Colaboratory
1 17.8 396.90 9.14
2 17.8 392.83 4.03
3 18.7 394.63 2.94
4 18.7 396.90 5.33
.. ... ... ...
501 21.0 391.99 9.67
502 21.0 396.90 9.08
503 21.0 396.90 5.64
504 21.0 393.45 6.48
505 21.0 396.90 7.88
# CRIM column of the feature table, selected by label.
z = data["CRIM"]
# correlation coefficient
def correlation(x, y):
    """Return the Pearson correlation coefficient of two paired samples.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers paired element-wise with ``x``.

    Returns:
        The sum of the products of paired deviations from the means, divided
        by the square root of the product of the two sums of squared
        deviations.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length.
        ZeroDivisionError: For plain Python numbers, if the inputs are empty
            or one of them has no spread.
    """
    if len(x) != len(y):
        # Unequal lengths cannot be paired; without this check the extra
        # elements would either be ignored or trigger an IndexError.
        raise ValueError("x and y must have the same length")

    mean_x = sum(x) / float(len(x))
    mean_y = sum(y) / float(len(y))
    dev_x = [value - mean_x for value in x]
    dev_y = [value - mean_y for value in y]

    cross_sum = sum(dx * dy for dx, dy in zip(dev_x, dev_y))
    # Sums of squared deviations (not yet standard deviations).
    sq_sum_x = sum(dx ** 2.0 for dx in dev_x)
    sq_sum_y = sum(dy ** 2.0 for dy in dev_y)
    return cross_sum / (sq_sum_x * sq_sum_y) ** 0.5
# Correlation between the target values (y) and the CRIM column (z).
print("he so tuong quan (y,z): ", correlation(y,z))
#Histogram
# Frequency distribution of the CRIM values in z, split into 25 bins and drawn
# on a new figure created with figsize (10, 7).
fig = plt.figure(figsize =(10,7))
plt.hist(z, bins=25, color='grey')
plt.title("crime rate")
plt.xlabel("cRIM")
plt.ylabel("frequency")
# Display the finished figure.
plt.show()
#Boxplot
# Box plot of the same CRIM values in z.
plt.boxplot(z)
plt.title("crime rate")
plt.ylabel("crime")
# Display the finished figure.
plt.show()
https://colab.research.google.com/drive/1TunkxkXexb5FlH_g8lO4LhgPqtTAmvmV#scrollTo=xV6gjZIYqnrN&printMode=true 3/4
07/02/2023, 23:27 BTVN1 - Colaboratory
Các sản phẩm có tính phí của Colab - Huỷ hợp đồng tại đây
check 0 giây hoàn thành lúc 23:27
https://colab.research.google.com/drive/1TunkxkXexb5FlH_g8lO4LhgPqtTAmvmV#scrollTo=xV6gjZIYqnrN&printMode=true 4/4