pandas读取数据
import pandas
# Load the CSV file into a DataFrame, the core pandas data structure.
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))
# List the dtype of every column; strings are reported as object, numbers
# as float64 / int64.
print(food_info.dtypes)
# First five rows.
first_rows = food_info.head()
print(first_rows)
# Last five rows. The result is printed explicitly: a bare
# `food_info.tail()` expression is evaluated and then silently discarded.
print(food_info.tail())
# First three rows.
print(food_info.head(3))
# Column names.
print(food_info.columns)
# Shape of the table as (rows, columns).
print(food_info.shape)
索引,根据函数来查看
# Row whose index label is 0.
food_info.loc[0]
# Label-based slice of rows 3 through 6 (.loc slices include both endpoints).
food_info.loc[3:6]
# Several rows at once, selected with a list of labels.
two_five_ten = [2, 5, 10]
food_info.loc[two_five_ten]
# One column selected by name: first with a literal, then through a variable.
col_name = "NDB_No"
ndb_col = food_info["NDB_No"]
ndb_col = food_info[col_name]
# Several columns selected with a list of names.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
# Build a sub-table holding only the columns whose name ends with "(g)".
col_names = food_info.columns.tolist()
gram_columns = [name for name in col_names if name.endswith("(g)")]
gram_df = food_info[gram_columns]
# Preview the first three rows of the gram-only table.
print(gram_df.head(3))
数据类型
object - For string values
int - For integer values
float - For float values
datetime - For time values
bool - For Boolean values
# Print the dtype pandas assigned to each column of food_info.
print(food_info.dtypes)
对表中的列进行计算
# Arithmetic between a column and a scalar is applied to every row.
div_1000 = food_info["Iron_(mg)"] / 1000
add_100 = food_info["Iron_(mg)"] + 100
sub_100 = food_info["Iron_(mg)"] - 100
mult_2 = food_info["Iron_(mg)"] * 2
# Arithmetic between two columns combines them row by row.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
# Add a new column to the original table: iron expressed in grams
# (1 g = 1000 mg). `iron_grams` has to be computed here; it was previously
# used without ever being defined, which raised NameError.
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams
# Rating score: Score = 2 x Protein_(g) - 0.75 x Lipid_Tot_(g)
weighted_protein = 2 * food_info["Protein_(g)"]
weighted_fat = food_info["Lipid_Tot_(g)"] * -0.75
initial_rating = weighted_protein + weighted_fat
# Normalize a column by dividing every value by the column's largest value.
max_calories = food_info["Energ_Kcal"].max()
normalized_calories = food_info["Energ_Kcal"] / max_calories
# Same normalization for protein and fat, with the maxima named first.
protein_max = food_info["Protein_(g)"].max()
fat_max = food_info["Lipid_Tot_(g)"].max()
normalized_protein = food_info["Protein_(g)"] / protein_max
normalized_fat = food_info["Lipid_Tot_(g)"] / fat_max
# Store the normalized values as new columns of the table.
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
# Sort food_info in place by sodium content: ascending first (the default
# order), then descending, so the table ends up in descending order.
food_info.sort_values(by="Sodium_(mg)", ascending=True, inplace=True)
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
泰坦尼克号乘客获救
分析数据每一列所代表的含义
PassengerId:每个乘客唯一的id号,共891名乘客
Survived:每名乘客最后是否获救,最后的标签值
Pclass:船舱的等级,一个类别的特征
Name:乘客的姓名
Sex:乘客的性别
Age:乘客的年龄
SibSp:同船的兄弟姐妹和配偶的数量
Parch:同船的父母和孩子的数量
Ticket:船票编码,意义不大
Fare:船票价格
Cabin:船舱的编号
Embarked:登船口
判断缺失值
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
# Inspect the "Age" column to see whether it has missing values.
age = titanic_survival["Age"]
print(age.loc[0:10])
# Boolean mask: True where the age is missing, False elsewhere.
age_is_null = pd.isnull(age)
print(age_is_null)
# Keep only the entries whose age is missing.
age_null_true = age.loc[age_is_null]
print(age_null_true)
# Number of missing ages.
age_null_count = len(age_null_true)
print(age_null_count)
# Mean age computed by hand over the non-missing entries only; the inverted
# mask filters the missing ones out.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
# Series.mean() skips missing values by default and gives the same result.
correct_mean_age = titanic_survival["Age"].mean()
# Relationship between one quantity and another: the mean of "Survived"
# for each passenger class ("Pclass").
# The aggregation is named by string because pandas 2.1+ emits a
# FutureWarning when NumPy callables such as np.mean / np.sum are passed
# as aggfunc; the string form keeps the current result.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
# Relationship between one quantity and two others: the summed "Fare" and
# summed "Survived" for each embarkation port.
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
# Drop every column that contains at least one missing value.
drop_na_columns = titanic_survival.dropna(axis="columns")
# Drop the rows in which "Age" or "Sex" is missing.
new_titanic_survival = titanic_survival.dropna(axis="index", subset=["Age", "Sex"])
# Look up a single value by row label and column name.
row_index_83_age = titanic_survival.loc[83, "Age"]
# Sort by age in descending order, then renumber the rows so the index
# follows the new order; drop=True discards the old index instead of
# keeping it as a column.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
# The result used to be bound only to the misspelled name below; it is kept
# as an alias so any existing reference to it keeps working.
itanic_reindexed = titanic_reindexed
# Custom functions meant to be passed to DataFrame.apply.
def hundredth_row(column):
    """Return the hundredth item (position 99) of ``column``.

    ``column`` must support positional ``.iloc`` indexing, e.g. a Series.
    """
    return column.iloc[99]
# Return the hundredth item from each column.
# NOTE(review): this assignment rebinds the name `hundredth_row` from the
# function to the result of apply(), so the function can no longer be called
# by that name afterwards.
hundredth_row = titanic_survival.apply(hundredth_row)
# Number of missing values in a column.
def not_null_count(column):
    """Return how many entries of ``column`` are missing.

    Despite its name, this counts the values that ARE null.
    """
    return len(column[pd.isnull(column)])
# Run not_null_count on every column of titanic_survival (apply works
# column by column when no axis is given).
column_null_count = titanic_survival.apply(not_null_count)
# Data conversion: numeric passenger class -> readable label.
def which_class(row):
    """Return the class label for ``row["Pclass"]``.

    A missing value gives "Unknown"; 1, 2 and 3 give "First Class",
    "Second Class" and "Third Class". Any other value gives None.
    """
    ticket_class = row["Pclass"]
    if pd.isnull(ticket_class):
        return "Unknown"
    labels = {1: "First Class", 2: "Second Class", 3: "Third Class"}
    return labels.get(ticket_class)
# Run which_class on every row (axis=1) to get one class label per row.
classes = titanic_survival.apply(which_class, axis=1)
# Discretize age into categories.
def generate_age_label(row):
    """Return an age category for ``row["Age"]``.

    A missing age gives "unknown", an age below 18 gives "minor", and
    anything else gives "adult".
    """
    years = row["Age"]
    if pd.isnull(years):
        return "unknown"
    return "minor" if years < 18 else "adult"
# Run generate_age_label on every row (axis=1) to get one age label per row.
age_labels = titanic_survival.apply(generate_age_label, axis=1)
series结构
dataframe中的一行或者一列就是series结构
series结构内又是np.ndarray的类型