pandas 学习笔记
import pandas as pd
df = pd.DataFrame({'id':[1,2,3],'name':['a','b','c']})
df = df.set_index('id')
df.to_excel(r'路径+保存的excel名称 eg:c:/test.xlsx')
import pandas as pd
people = pd.read_excel('读取的excel文件路径',header=1)
people.shape
people.columns
people.columns=['id','type','name']
people.setindex('id',inplace=True)
people.head()
people.tail()
people = pd.read_excel('读取的excel文件路径',index_col='id')
import pandas as pd
d={'x':100,'y':200,'c':300}
s1 = pd.Series(d)
'''s1.index s1.name s1.data'''
l1 = [100,200,300]
l2 = ['x','y','z']
s2 = pd.Series(l1,index=l2)
s3 = pd.Series([100,200,300],index=['x','y','z'])
s1 = pd.Series([1,2,3],index=[1,2,3],name='A')
s2 = pd.Series([10,20,30],index=[1,2,3],name='B')
s3 = pd.Series([100,200,300],index=[1,2,3],name='C')
df = pd.DataFrame(s1.name:s1,s2.name:s2,s3.name:s3)
df = pd.DataFrame([s1,s2,s3])
print(df)
import pandas as pd
books = pd.read_excel(r'读取路径',skiprows=3,usecols='c:f',index_cols=None,dtype={'id':str})
books['id'].at[0]=100
'''skiprows跳过空行 usecols:读取的区域范围 at索引开始位置为nan 自动设置成浮点类型 dtype可以设置类型 指定列即设置 '''
from datetime import time.timedelta
def add_month(d,md):
yd = md//12
m = d.month+md % 12
if m! = 12:
yd=+m//12
m=m%12
return date(d.year+yd,m,d.day)
start = date[2018,1,1]
for i in books.index:
books['id'].at[i]=i+1
books['date'].at[i]=start+timedelta(days=i)
books['date'].at[i]=date(start.year+i, \
start.month,start.day)
books['date'].at[i]=add_month(start,i)
books.set_index('id',inplace=True)
books.to_excel('保存路径')
''' 同上 '''
import pandas as pd
books =pd.read_excel('读取路径',index_col='id')
books['listprice']=books['listprice'].apply(lambda x:x+2 )
'''apply 后面接的是函数 或函数名'''
import pandas as pd
products =pd.read_excel('读取路径',index_col='id')
products.sort_values(by='要排序的字段名',inplace=True,ascending=False)
'''sort_values by=['字段一','字段二'] 排序 ascending=[对应字段一的排序 true/false ,字段二的.... ]从小到大 升序'''
import pandas as pd
students =pd.read_excel('读取路径',index_col='id')
studnets=students.loc[students.age.apply(lambdaa:18<=a<30)] \
.loc[students['score'].apply(lambda s: 85<=s<=100)]
'''loc 筛选
1.筛选数据的条件通常是通过函数的形式来表达的。因此对于目标excel的筛选条件我们要先定义相应的函数
2.采用lambda表达式可以省略函数定义,式整个代码变得更加简洁
'''
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel(r'读取excel路径')
students.sort_values(by='字段名',inplace=True,ascending=False)
plt.bar(students.Field,students.Number,color='blue')
plt.xticks(students.Field,rotation='90')
plt.xlabel('Field')
plt.ylabel('Number')
plt.title('test柱状图',fontsize=16)
plt.tight_layout()
plt.show()
'''tight_layout:紧凑行布局
rotation 旋转角度
'''
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel(r'读取excel路径')
students.plot.bar(x='Field',y=['2016','2017'],color=['orange','red'])
plt.title('test柱状图',fontsize=16,fontweight='blod')
plt.xlabel('Field',fontweight='bold')
plt.ylabel('Number',fontweight='bold')
ax =plt.gca()
ax.set_xticklabels(students['Field'],rotation=45,ha='right')
f =plt.gcf()
f.subplots_adjust(left=0.2,bottom=0.42)
plt.show()
'''ha :水平对齐 subplots_adjust 调整距离'''
import pandas as pd
import matplotlib.pyplot as plt
users = pd.read_excel('路径')
users['Total'] = users['Oct'] + users['Nov'] + users['Dec']
users.sort_values(by='Total', inplace=True, ascending=False)
print(users)
users.plot.bar(x='Name', y=['Oct', 'Nov', 'Dec'], stacked=True)
plt.tight_layout()
plt.show()
'''stacked 叠加成一个柱 barh 水平的'''
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel('路径', index_col='From')
print(students)
students['2017'].plot.pie(fontsize=8, counterclock=False, startangle=-270)
plt.title('Source of International Students', fontsize=16, fontweight='bold')
plt.ylabel('2017', fontsize=12, fontweight='bold')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
weeks = pd.read_excel('路径', index_col='Week')
print(weeks)
weeks.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components'])
plt.title('Sales Trends', fontsize=16, fontweight='bold')
plt.xticks(weeks.index, fontsize=8)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 999
homes = pd.read_excel('路径')
print(homes.head())
homes.sqft_living.plot.hist(bins=100)
plt.xticks(range(0, max(homes.sqft_living), 500), fontsize=8, rotation=90)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 999
homes = pd.read_excel('路径')
print(homes.corr())
'''corr:相关性'''
import pandas as pd
students = pd.read_excel('路径', sheet_name='Students', index_col='ID')
scores = pd.read_excel('路径', sheet_name='Scores', index_col='ID')
table = students.join(scores, how='left').fillna(0)
table.Score = table.Score.astype(int)
print(table)
'''left 左边填满 没有的数据用nan on=指定联立的字段
sheet_name 表名 join联立表 fillna 指定值把nan数据填充
'''
import pandas as pd
def score_valication(row):
try:
assert 0 <= row.Score <= 100
except:
print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}')
students = pd.read_excel('路径.xlsx')
students.apply(score_valication, axis=1)
import pandas as pd
employees = pd.read_excel('文件路径.xlsx', index_col='ID')
df = employees['Full Name'].str.split(expand=True,n=2)
employees['First Name'] = df[0]
employees['Last Name'] = df[1]
print(employees)
import pandas as pd
students = pd.read_excel('路径.xlsx', index_col='ID')
row_sum = students[['Test_1', 'Test_2', 'Test_3']].sum(axis=1)
row_mean = students[['Test_1', 'Test_2', 'Test_3']].mean(axis=1)
students['Total'] = row_sum
students['Average'] = row_mean
col_mean = students[['Test_1', 'Test_2', 'Test_3', 'Total', 'Average']].mean()
col_mean['Name'] = 'Summary'
students = students.append(col_mean, ignore_index=True)
print(students)
import pandas as pd
students = pd.read_excel('路径.xlsx')
dupe = students.duplicated(subset='Name')
dupe = dupe[dupe == True]
print(students.iloc[dupe.index])
print("=========")
students.drop_duplicates(subset='Name', inplace=True, keep='last')
print(students)
import pandas as pd
pd.options.display.max_columns = 999
videos = pd.read_excel('C:/Temp/Videos.xlsx', index_col='Month')
table = videos.T
print(table)
import pandas as pd
students1 = pd.read_csv('C:/Temp/Students.csv', index_col='ID')
students2 = pd.read_csv('C:/Temp/Students.tsv', sep='\t', index_col='ID')
students3 = pd.read_csv('C:/Temp/Students.txt', sep='|', index_col='ID')
print(students1)
print(students2)
print(students3)
import pandas as pd
from datetime import date
orders = pd.read_excel('C:/Temp/Orders.xlsx', dtype={'Date': date})
orders['Year'] = pd.DatetimeIndex(orders.Date).year
groups = orders.groupby(['Category', 'Year'])
s = groups['Total'].sum()
c = groups['ID'].count()
pt1 = pd.DataFrame({'Sum': s, 'Count': c})
pt2 = orders.pivot_table(index='Category', columns='Year', values='Total', aggfunc=np.sum)
print(pt1)
print(pt2)
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
sales = pd.read_excel('C:/Temp/Sales.xlsx', dtype={'Month': str, 'Revenue': float})
print(sales)
slope, intercept, r_value, p_value, std_err = linregress(sales.index, sales.Revenue)
exp = sales.index * slope + intercept
plt.scatter(sales.index, sales.Revenue)
plt.plot(sales.index, exp, color='red')
plt.xticks(sales.index, sales.Month, rotation=90)
plt.show()
常用函数说明
Series.plot方法的函数:
参数 说明
label 用于图例的标签
ax 要在其上进行绘制的matplotlib subplot对象。如果没有设置,则使用当前matplotlib subplot
style 将要传给matplotlib的风格字符串(for example: ‘ko–’)
alpha 图表的填充不透明(0-1)
kind 可以是’line’, ‘bar’, ‘barh’, ‘kde’
logy 在Y轴上使用对数标尺
use_index 将对象的索引用作刻度标签
rot 旋转刻度标签(0-360)
xticks 用作X轴刻度的值
yticks 用作Y轴刻度的值
xlim X轴的界限
ylim Y轴的界限
grid 显示轴网格线
DataFrame还有一些用于对列进行灵活处理的选项,例如,要将所有列都绘制到一个subplot中还是创建各自的subplot。参数如下表:
参数 说明
subplots 将各个DataFrame列绘制到单独的subplot中
sharex 如果subplots=True,则共用同一个X轴,包括刻度和界限
sharey 类似于上
figsize 表示图像大小的元组
title 表示图像标题的字符串
legend 添加一个subplot图例(默认为True)
sort_columns 以字母表顺序绘制各列,默认使用前列顺序
axis=” “修改填充方向 轴 0从上到下 1从左到右
iloc 定位