Pandas 的分组统计功能可以按某一列的内容对数据行进行分组,并对其应用统计函数,比如求和,平均数,中位数,标准差等等…
举例来说,用 .groupby() 方法,我们可以对下面这数据表按 'Company' 列进行分组,并用 .mean() 求每组的平均值:
import pandas as pd
import numpy as np
d={'Company':['Google','Google','Google','Oracle','Oracle','Oracle','Twitter','Twitter','Twitter'],
'Person':['Peter','Sam','Ann','Cindy','Tom','Lee','Gim','Will','Bob'], 'Sale':[200,300,275,250,270,300,350,220,250],
'Wage':[4000,3870,3900,3500,3600,3870,3650,3220,3660]}
df=pd.DataFrame(d)
df
df.groupby('Company').mean()#结果中,Sale列就变成每个公司的分组平均数了。
type(df.groupby('Company'))
df.groupby('Company')['Wage'].mean()
df.groupby('Person').mean()
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html
axis : {0 or ‘index’, 1 or ‘columns’}, default 0
If 0 or ‘index’ counts are generated for each column . If 1 or ‘columns’ counts are generated for each row.
level : int or str, optional
If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame. A str specifies the level name.
numeric_only : boolean, default False
Include only float, int or boolean data.
df.set_index(["Company", "Person"]) #制作一个Multiindex
df.set_index(["Company", "Person"]).count(level="Company") #对Multiindex的某一个level统计
df.set_index(["Person", "Company"])
df.set_index(["Person", "Company"]).count(level="Person")
df
df.count() #每一列的非零元素
df.count(axis=1) #每一行的非零元素
df['Company']
df['Company'].count()
Pandas 的 .describe() 方法将对 DataFrame 里的数据进行分析,并一次性生成多个描述性的统计指标,方便用户对数据有一个直观上的认识。
生成的指标,从左到右分别是:计数、平均数、标准差、最小值、25% 50% 75% 位置的值、最大值。
df.groupby('Company').describe()
df.groupby('Company')['Sale'].describe() # 功能与df.groupby('Company').describe()['Sale']
df.groupby('Company').describe()['Sale']
df.groupby('Company').describe().transpose()
df.groupby('Company')['Sale'].describe().transpose()
df.groupby('Company').describe().transpose()['Google']
df1=pd.DataFrame({'A':['A0','A1','A2'],'B':['B0','B1','B2'],'C':['C1','C2','C3']},index=[0,1,2])
df2=pd.DataFrame({'A':['A3','A4','A5'],'B':['B3','B4','B5'],'C':['C3','C4','C5']},index=[3,4,5])
df3=pd.DataFrame({'A':['A6','A7','A8'],'B':['B6','B7','B8'],'C':['C6','C7','C8']},index=[6,7,8])
pd.concat([df1,df2,df3]) #[df1,df2,df3]
pd.concat([df1,df2,df3],axis=0)
pd.concat([df1,df2,df3],axis=1)
#注意,这里出现了一大堆空值。因为我们用来堆叠的3个 DataFrame 里,有许多索引是没有对应数据的。
#因此,当你使用 pd.concat() 的时候,一定要注意堆叠方向的坐标轴(行或列)含有所需的所有数据。
df1=pd.DataFrame({'A':['A0','A1','A2'],'B':['B0','B1','B2'],'C':['C1','C2','C3']})
df2=pd.DataFrame({'A':['A3','A4','A5'],'B':['B3','B4','B5'],'C':['C3','C4','C5']})
df3=pd.DataFrame({'A':['A6','A7','A8'],'B':['B6','B7','B8'],'C':['C6','C7','C8']})
pd.concat([df1,df2,df3],axis=0) #[df1,df2,df3]
pd.concat([df1,df2,df3],axis=1) #[df1,df2,df3]
使用 pd.merge() 函数,能将多个 DataFrame 归并在一起,它的合并方式类似合并 SQL 数据表的方式。
归并操作的基本语法是 pd.merge(left_dataframe, right_dataframe, how='inner', on='Key') 。其中 :
merge_df1=pd.DataFrame({'key':['k1','k2','k3'],'A':['A1','A2','A3'],'B':['B1','B2','B3']})
merge_df2=pd.DataFrame({'key':['k1','k2','k3'],'C':['C1','C2','C3'],'D':['D1','D2','D3']})
pd.merge(merge_df1,merge_df2,how='inner',on='key')
merge_df1=pd.DataFrame({'key1':['k1','k2','k3'],'key2':['k4','k5','k6'],'A':['A1','A2','A3'],'B':['B1','B2','B3']})
merge_df2=pd.DataFrame({'key1':['k1','k2','k3'],'key2':['k4','k5','k6'],'C':['C1','C2','C3'],'D':['D1','D2','D3']})
pd.merge(merge_df1,merge_df2,how='inner',on=['key1','key2'])
merge_df1=pd.DataFrame({'key1':['k1','k3','k4'],'A':['A1','A2','A3'],'B':['B1','B2','B3']})
merge_df2=pd.DataFrame({'key1':['k1','k2','k3'],'C':['C1','C2','C3'],'D':['D1','D2','D3']})
pd.merge(merge_df1,merge_df2,how='outer',on='key1')
join_df1=pd.DataFrame({'A':['A1','A2','A3'],'B':['B1','B2','B3']})
join_df2=pd.DataFrame({'C':['C1','C2','C3'],'D':['D1','D2','D3']})
join_df1.join(join_df2)
join_df1=pd.DataFrame({'A':['A1','A2','A3'],'B':['B1','B2','B3']},index=['key1','key2','key3'])
join_df2=pd.DataFrame({'C':['C1','C2','C3'],'D':['D1','D2','D3']},index=['key1','key3','key4'])
join_df1.join(join_df2)
join_df1=pd.DataFrame({'A':['A1','A2','A3'],'B':['B1','B2','B3']},index=['key1','key2','key3'])
join_df2=pd.DataFrame({'C':['C1','C2','C3'],'D':['D1','D2','D3']},index=['key1','key3','key4'])
join_df1.join(join_df2,how='outer')
不重复的值,在一个 DataFrame 里往往是独一无二,与众不同的。找到不重复的值,在数据分析中有助于避免样本偏差。在 Pandas 里,主要用到 3 种方法:
join_df1=pd.DataFrame({'A':['A1','A2','A3','A1'],'B':['B1','B2','B3','B4']})
join_df1.head()
join_df1['A'].unique()
join_df1['B'].nunique()
join_df1['A'].nunique()
join_df1['A'].value_counts()
join_df1.value_counts()# DataFrame没有这个属性 只有Series有
type(join_df1['A'])
# suppose we have a function to calculate the square of a given value
def square(x):
return x*x
# we can call the self-defined function with .apply()
df_apply=pd.DataFrame({'Company':['Google','Google','Google','Oracle','Oracle','Oracle','Twitter','Twitter','Twitter'],
'Person':['Peter','Sam','Ann','Cindy','Tom','Lee','Gim','Will','Bob'], 'Sale':[20,30,27.5,25,27,30,35,22,25]})
df_apply['Sale'].apply(square)
df_apply['Company'].apply(len) #Company列的内容字符长度
df_apply['Sale'].apply(lambda x: x*x)
df_apply.index
df_apply.columns
df_apply.sort_values('Sale')
df_apply#df_apply.set_index('Person')
df_apply.sort_values('Sale')
type(df_apply.sort_values('Sale'))
df_apply.sort_values('Sale').loc['Will']
df_apply.sort_values('Sale').loc['Will','Company']
df_apply.isnull()
df_apply['Company'].isnull()
data={'A':['Dog','Dog','Dog','Goat','Goat','Goat'],
'B':['Brown','Brown','Black','Brown','Brown','Black'],
'C':['x','y','x','y','x','y'],
'D':[1,3,2,4,5,3]}
df_pivot=pd.DataFrame(data)
df_pivot
Pandas 数据透视表的语法是 .pivot_table(data, values='', index=[''], columns=['']) ,
pd.pivot_table(df_pivot,values='D',index=['A','B'],columns='C')
或者也可以直接调用 df 对象的方法:
df_pivot.pivot_table(values='D',index=['A','B'],columns='C')
df_pivot.to_csv('New_DataFrame.csv') # See problem if the index doesn't be ignored
df_pivot.to_csv('New_DataFrame_2.csv',index=False)# if you don't want the index 0,1,2..
pd.read_csv() 就能将 CSV 文件里的数据转换成 DataFrame 对象:
import os
os.chdir("/home/junyao/文档/pandas_practice")
new_df=pd.read_csv('New_DataFrame_2.csv')
new_df
new_df1=pd.read_csv('New_DataFrame.csv')
new_df1
df_pivot_new = df_pivot.set_index('A')
df_pivot_new
df_pivot_new.to_excel('New_DataFrame.xlsx',sheet_name='Sheet 1')
new_df_pivot_new=pd.read_excel('New_DataFrame.xlsx')
new_df_pivot_new
df_html=pd.read_html('https://en.wikipedia.org/wiki/Udacity')
df_html[0]
df_html[1]