pandas 学习笔记

## 第一节:创建文件
#-*-coding:utf-8-*-
#导入pandas
import pandas as pd
#创建数据类的dataframe
df = pd.DataFrame({'id':[1,2,3],'name':['a','b','c']})
#设置索引列
df = df.set_index('id')
#转成excel 数据表保存
df.to_excel(r'路径+保存的excel名称 eg:c:/test.xlsx')
## 第二节:读取文件
#-*-coding:utf-8-*-
#导入pandas
import pandas as pd
#header 表示列名
people = pd.read_excel('读取的excel文件路径',header=1)

#告诉我们几行几列
people.shape
#列
people.columns
people.columns=['id','type','name']
#设置索引列 不改变原来的
people.setindex('id',inplace=True)
#head里不写默认看前五行
people.head()
#末尾几行
people.tail()

#再次读取时又会自动生成index 索引值,所以读取时要指明
people = pd.read_excel('读取的excel文件路径',index_col='id')
#第三节:行、列、单元格
#-*-coding:utf-8-*-
#导入pandas
import pandas as pd
#方法一:
d={'x':100,'y':200,'c':300}
#生成一个序列对象
s1 = pd.Series(d)
'''s1.index  s1.name  s1.data'''

#方法二:
l1 = [100,200,300]
l2 = ['x','y','z']
s2 = pd.Series(l1,index=l2)

#方法三:
s3 = pd.Series([100,200,300],index=['x','y','z'])

#实例
#index 对齐 没有的用nan 自动填充补位
s1 = pd.Series([1,2,3],index=[1,2,3],name='A')
s2 = pd.Series([10,20,30],index=[1,2,3],name='B')
s3 = pd.Series([100,200,300],index=[1,2,3],name='C')
df = pd.DataFrame(s1.name:s1,s2.name:s2,s3.name:s3)
#如果以列表添加,行、列就会相反
df = pd.DataFrame([s1,s2,s3])
print(df)
#第四节:数字区域读取、填充数字
#-*-coding:utf-8-*-
#导入pandas
import pandas as pd

books = pd.read_excel(r'读取路径',skiprows=3,usecols='c:f',index_cols=None,dtype={'id':str})
books['id'].at[0]=100
'''skiprows跳过空行  usecols:读取的区域范围  at索引开始位置为nan 自动设置成浮点类型 dtype可以设置类型 指定列即设置  '''
#导入时间所需要的模块
from datetime import time.timedelta
#对于月份的算法
def add_month(d,md):
    yd = md//12
    m = d.month+md % 12
    if m! = 12:
        yd=+m//12
        m=m%12
    return date(d.year+yd,m,d.day)
#初始化日期
start = date[2018,1,1]
for i in books.index:
    #改的series
    books['id'].at[i]=i+1
    #直接改dataframe
    #books.at[i,'id']=i+1
    books['date'].at[i]=start+timedelta(days=i)
    books['date'].at[i]=date(start.year+i, \
    start.month,start.day)
    #添加月份
    books['date'].at[i]=add_month(start,i)
books.set_index('id',inplace=True)
books.to_excel('保存路径')
#第五节:填充日期序列
#-*-coding:utf-8-*-
#导入pandas
''' 同上 '''
#第六节:函数填充,计算列
#-*-coding:utf-8-*-
#导入pandas
import pandas as pd
books =pd.read_excel('读取路径',index_col='id')
#for i in range(5,16):
#for i in books.index:
    #books['price'].at[i]=books['listprice'].at[i]*\
    #books['discount'].at[i]

books['listprice']=books['listprice'].apply(lambda x:x+2 )
'''apply 后面接的是函数 或函数名'''
#第七节:排序,多重排序
#-*-coding:utf-8-*-
#导入pandas
import pandas as pd
products =pd.read_excel('读取路径',index_col='id')
products.sort_values(by='要排序的字段名',inplace=True,ascending=False)
'''sort_values  by=['字段一','字段二']  排序   ascending=[对应字段一的排序 true/false ,字段二的.... ]从小到大 升序'''
#第八节:数据筛选、过滤
#-*-coding:utf-8-*-
#导入pandas
import pandas as pd
students =pd.read_excel('读取路径',index_col='id')
#筛选
studnets=students.loc[students.age.apply(lambdaa:18<=a<30)] \
.loc[students['score'].apply(lambda s: 85<=s<=100)]
'''loc 筛选
1.筛选数据的条件通常是通过函数的形式来表达的。因此对于目标excel的筛选条件我们要先定义相应的函数
2.采用lambda表达式可以省略函数定义,式整个代码变得更加简洁
'''
#第九节:柱状图
#-*-coding:utf-8-*-
#导入pandas
import pandas as pd
#画图
import matplotlib.pyplot as plt
students = pd.read_excel(r'读取excel路径')
#进行排序
students.sort_values(by='字段名',inplace=True,ascending=False)

#students.plot.bar(x='Field',y='Number',color='pink',title='test柱状图')

plt.bar(students.Field,students.Number,color='blue')
plt.xticks(students.Field,rotation='90')
plt.xlabel('Field')
plt.ylabel('Number')
plt.title('test柱状图',fontsize=16)
plt.tight_layout()
plt.show()
'''tight_layout:紧凑行布局
   rotation 旋转角度
'''
#第十节:分组柱图,深度优化
#-*-coding:utf-8-*-
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel(r'读取excel路径')
students.plot.bar(x='Field',y=['2016','2017'],color=['orange','red'])
plt.title('test柱状图',fontsize=16,fontweight='blod')
plt.xlabel('Field',fontweight='bold')
plt.ylabel('Number',fontweight='bold')
ax =plt.gca()
ax.set_xticklabels(students['Field'],rotation=45,ha='right')

f =plt.gcf()
f.subplots_adjust(left=0.2,bottom=0.42)
#plt.tight_layout()
plt.show()
'''ha :水平对齐  subplots_adjust 调整距离'''
#第十一节:叠加柱状图,水平柱状图
#-*-coding:utf-8-*-
import pandas as pd
import matplotlib.pyplot as plt

users = pd.read_excel('路径')
users['Total'] = users['Oct'] + users['Nov'] + users['Dec']
users.sort_values(by='Total', inplace=True, ascending=False)
print(users)

users.plot.bar(x='Name', y=['Oct', 'Nov', 'Dec'], stacked=True)
plt.tight_layout()
plt.show()
'''stacked 叠加成一个柱   barh 水平的'''
#第十二节:饼图
#-*-coding:utf-8-*-
import pandas as pd
import matplotlib.pyplot as plt
#绘制饼图是针对一个seriers
students = pd.read_excel('路径', index_col='From')
print(students)
#绘制饼图时,需要加上:plt.axis('equal'),确保饼图是一个正圆,否则为椭圆
#counterclock:逆时针
students['2017'].plot.pie(fontsize=8, counterclock=False, startangle=-270)
plt.title('Source of International Students', fontsize=16, fontweight='bold')
plt.ylabel('2017', fontsize=12, fontweight='bold')
plt.show()
#第十三节:折线趋势图,叠加区域图
#-*-coding:utf-8-*-
import pandas as pd
import matplotlib.pyplot as plt

weeks = pd.read_excel('路径', index_col='Week')
print(weeks)
#area 叠加区域图
# weeks.plot(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
weeks.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
plt.title('Sales Trends', fontsize=16, fontweight='bold')
plt.xticks(weeks.index, fontsize=8)
plt.show()
#第十四节:散点图,直方图
#-*-coding:utf-8-*-
import pandas as pd
import matplotlib.pyplot as plt
#数据显示最大的列
pd.options.display.max_columns = 999
homes = pd.read_excel('路径')
print(homes.head())
# print(homes.corr())
#scatter 分散;散点图
# homes.plot.scatter(x='sqft_living', y='price')
# homes.sqft_living.plot.kde()
#hist 直方图
homes.sqft_living.plot.hist(bins=100)
plt.xticks(range(0, max(homes.sqft_living), 500), fontsize=8, rotation=90)
# homes.price.plot.hist(bins=200)
# plt.xticks(range(0, max(homes.price), 100000), fontsize=8, rotation=90)
plt.show()
#第十五节:密度图、数据相关性
#-*-coding:utf-8-*-
import pandas as pd
import matplotlib.pyplot as plt

pd.options.display.max_columns = 999
homes = pd.read_excel('路径')
print(homes.corr())
#同上一节已经注释
'''corr:相关性'''
#第十六节:多表联合
#-*-coding:utf-8-*-
import pandas as pd
# students =pd.read_excel('路径',sheet_name='Students')
# scores = pd.read_excel('路径', sheet_name='Scores')
# table = students.merge(scores, how='left', on='ID').fillna(0)
# table.Score = table.Score.astype(int)
# print(table)

# students = pd.read_excel('路径', sheet_name='Students', index_col='ID')
# scores = pd.read_excel('路径', sheet_name='Scores', index_col='ID')
# table = students.merge(scores, how='left', left_on=students.index, right_on=scores.index).fillna(0)
# table.Score = table.Score.astype(int)
# print(table)

students = pd.read_excel('路径', sheet_name='Students', index_col='ID')
scores = pd.read_excel('路径', sheet_name='Scores', index_col='ID')
table = students.join(scores, how='left').fillna(0)
table.Score = table.Score.astype(int)
print(table)
'''left   左边填满 没有的数据用nan   on=指定联立的字段
   sheet_name 表名    join联立表  fillna 指定值把nan数据填充
'''
#第十七节:数据校验,轴的概念
#-*-coding:utf-8-*-
import pandas as pd


def score_valication(row):
    try:
        assert 0 <= row.Score <= 100
    except:
        print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}')


students = pd.read_excel('路径.xlsx')
# print(students)
students.apply(score_valication, axis=1)
#第十八节:把一列数据分割成两列
#-*-coding:utf-8-*-
import pandas as pd

employees = pd.read_excel('文件路径.xlsx', index_col='ID')
# n 此处保留两个分割后的字符窜
df = employees['Full Name'].str.split(expand=True,n=2)
#str.upper() 转化成大写
employees['First Name'] = df[0]
employees['Last Name'] = df[1]
print(employees)
#第十九节:求和,求平均,统计导引
#-*- coding:utf-8 -*-
import pandas as pd

students = pd.read_excel('路径.xlsx', index_col='ID')

row_sum = students[['Test_1', 'Test_2', 'Test_3']].sum(axis=1)
#mean 平均值
row_mean = students[['Test_1', 'Test_2', 'Test_3']].mean(axis=1)

students['Total'] = row_sum
students['Average'] = row_mean

col_mean = students[['Test_1', 'Test_2', 'Test_3', 'Total', 'Average']].mean()
col_mean['Name'] = 'Summary'
students = students.append(col_mean, ignore_index=True)
print(students)
#第二十节:定位、消除重复数据
# -*- coding:utf-8 -*-
import pandas as pd

students = pd.read_excel('路径.xlsx')
#subset消除重复的列  duplicated重复的  keep='first' 保留先出现重复的数据
dupe = students.duplicated(subset='Name')
dupe = dupe[dupe == True]  # dupe = dupe[dupe]
print(students.iloc[dupe.index])
print("=========")
students.drop_duplicates(subset='Name', inplace=True, keep='last')
print(students)
#第二十一节:旋转数据表
# -*- coding:utf-8 -*-
import pandas as pd

pd.options.display.max_columns = 999
videos = pd.read_excel('C:/Temp/Videos.xlsx', index_col='Month')
#数据旋转
# table = videos.transpose()
table = videos.T
print(table)
#第二十二节:读取csv、tsv、txt文件中的数据
# -*- coding:utf-8 -*-
import pandas as pd

students1 = pd.read_csv('C:/Temp/Students.csv', index_col='ID')
students2 = pd.read_csv('C:/Temp/Students.tsv', sep='\t', index_col='ID')
students3 = pd.read_csv('C:/Temp/Students.txt', sep='|', index_col='ID')

print(students1)
print(students2)
print(students3)
#第二十三节:透视表,分组。聚合(group by)
# -*- coding:utf-8 -*-
import pandas as pd
from datetime import date

orders = pd.read_excel('C:/Temp/Orders.xlsx', dtype={'Date': date})
orders['Year'] = pd.DatetimeIndex(orders.Date).year
groups = orders.groupby(['Category', 'Year'])
s = groups['Total'].sum()
c = groups['ID'].count()
pt1 = pd.DataFrame({'Sum': s, 'Count': c})
pt2 = orders.pivot_table(index='Category', columns='Year', values='Total', aggfunc=np.sum)

print(pt1)
print(pt2)
#第二十四节:线性回归,数据预测
# -*- coding:utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress

sales = pd.read_excel('C:/Temp/Sales.xlsx', dtype={'Month': str, 'Revenue': float})
print(sales)

slope, intercept, r_value, p_value, std_err = linregress(sales.index, sales.Revenue)
exp = sales.index * slope + intercept

plt.scatter(sales.index, sales.Revenue)
plt.plot(sales.index, exp, color='red')
plt.xticks(sales.index, sales.Month, rotation=90)
plt.show()
#第二十五、六节:条件格式 与格式化
# -*- coding:utf-8 -*-
#第二十七节:行操作集锦
# -*- coding:utf-8 -*-
#第二十八节:列操作集锦
# -*- coding:utf-8 -*-
#第二十九节:读取数据库
# -*- coding:utf-8 -*-
#第三十节:编写复杂方程
# -*- coding:utf-8 -*-
                             常用函数说明
Series.plot方法的函数:
参数     说明
label     用于图例的标签
ax     要在其上进行绘制的matplotlib subplot对象。如果没有设置,则使用当前matplotlib subplot
style     将要传给matplotlib的风格字符串(for example: ‘ko–’)
alpha     图表的填充不透明(0-1)
kind     可以是’line’, ‘bar’, ‘barh’, ‘kde’
logy     在Y轴上使用对数标尺
use_index     将对象的索引用作刻度标签
rot     旋转刻度标签(0-360)
xticks     用作X轴刻度的值
yticks     用作Y轴刻度的值
xlim     X轴的界限
ylim     Y轴的界限
grid     显示轴网格线

DataFrame还有一些用于对列进行灵活处理的选项,例如,要将所有列都绘制到一个subplot中还是创建各自的subplot。参数如下表:
参数     说明
subplots     将各个DataFrame列绘制到单独的subplot中
sharex     如果subplots=True,则共用同一个X轴,包括刻度和界限
sharey     类似于上
figsize     表示图像大小的元组
title     表示图像标题的字符串
legend     添加一个subplot图例(默认为True)
sort_columns     以字母表顺序绘制各列,默认使用前列顺序

axis=” “修改填充方向 轴 0从上到下  1从左到右
iloc 定位