本文共 23080 字,大约阅读时间需要 76 分钟。
import pandas as pd
# 1查看pandas版本信息pd.__version__
'0.24.2'
# 创建 Series 数据类型# Pandas 中,Series 可以被看作由 1 列数据组成的数据集。# 创建 Series 语法:s = pd.Series(data, index=index),可以通过多种方式进行创建,以下介绍了 3 个常用方法。
# 3.从列表创建Seriesarr = [1,2,3,4]s1 = pd.Series(arr) # 若没有指定索引,默认从0开始s1
0 11 22 33 4dtype: int64
# 4 .从Ndarray创建Seriesimport numpy as npn = np.random.randn(5) # 随机生成一个数组index = ['a','b','c','d','e'] # 指定了索引s2 = pd.Series(n, index=index)s2
a -0.583111b -0.466115c 0.542662d -0.745683e -0.529050dtype: float64
# 5.从字典创建Seriesdic = { 'a':1,'b':2,'c':3,'d':4,'e':5}s3 = pd.Series(dic)s3
a 1b 2c 3d 4e 5dtype: int64
# 6. 修改Series索引print(s1)s1.index=['A','B','C','D']s1
0 11 22 33 4dtype: int64A 1B 2C 3D 4dtype: int64
# 7. Vertical concatenation: place the entries of s1 after those of s3.
# Series.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same result and is also available on older releases.
s4 = pd.concat([s3, s1])
s4
a 1b 2c 3d 4e 5A 1B 2C 3D 4dtype: int64
print(s4)s4 = s4.drop('e') # 删除索引为e的值s4
a 1b 2c 3d 4e 5A 1B 2C 3D 4dtype: int64a 1b 2c 3d 4A 1B 2C 3D 4dtype: int64
# 8.Series按指定索引修改元素s4['A'] = 100s4
a 1b 2c 3d 4A 100B 2C 3D 4dtype: int64
# 9.按指定索引查找元素s4['B']
2
# 10 Series切片操作s4[:3] #对s4前三个数据访问
a 1b 2c 3dtype: int64
# 11 加法运算 Series 的加法运算是按照索引计算,如果索引不同则填充为 NaN(空值)。s4.add(s3)
A NaNB NaNC NaND NaNa 2.0b 4.0c 6.0d 8.0e NaNdtype: float64
# 13.减法亦是s4.sub(s3)
A NaNB NaNC NaND NaNa 0.0b 0.0c 0.0d 0.0e NaNdtype: float64
# 14.乘法s4.mul(s3)
A NaNB NaNC NaND NaNa 1.0b 4.0c 9.0d 16.0e NaNdtype: float64
# 15 除法s4.div(s3)
A NaNB NaNC NaND NaNa 1.0b 1.0c 1.0d 1.0e NaNdtype: float64
# 16.求中位数print(s4)s4.median()
a 1b 2c 3d 4A 100B 2C 3D 4dtype: int643.0
# 17.求和s4.sum()
119
# 18.19.最大最小值print(s4.max())s4.min()
1001
# 创建 DataFrame 数据类型
# 与 Series 不同,DataFrame 可以存在多列数据。一般情况下,DataFrame 也更加常用。
# 20. 通过 NumPy 数组创建 DataFramedates = pd.date_range('today', periods=6) # 定义时间序列作为indexnumbers = np.random.randn(6, 4)columns = ['A','B','C','D']df1 = pd.DataFrame(numbers, index=dates, columns=columns)df1
| A | B | C | D |
2019-07-16 09:59:10.131414 | 1.536536 | -1.598355 | -2.354828 | -1.151150 |
2019-07-17 09:59:10.131414 | 0.758288 | 0.143739 | -0.389704 | 0.369642 |
2019-07-18 09:59:10.131414 | -0.612505 | 0.752261 | 0.243023 | -0.110990 |
2019-07-19 09:59:10.131414 | 0.130843 | 1.308658 | 0.765599 | 0.892070 |
2019-07-20 09:59:10.131414 | 1.220489 | -0.415430 | -0.878169 | -0.215298 |
2019-07-21 09:59:10.131414 | -0.098756 | -2.210043 | 0.376714 | 0.521180 |
# 21.通过字典数组创建DataFramedata = { 'animal':['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'], 'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3], 'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1], 'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']df2 = pd.DataFrame(data, index=labels)df2
| animal | age | visits | priority |
a | cat | 2.5 | 1 | yes |
b | cat | 3.0 | 3 | yes |
c | snake | 0.5 | 2 | no |
d | dog | NaN | 3 | yes |
e | dog | 5.0 | 2 | no |
f | cat | 2.0 | 3 | no |
g | snake | 4.5 | 1 | no |
h | cat | NaN | 1 | yes |
i | dog | 7.0 | 2 | no |
j | dog | 3.0 | 1 | no |
#### 22. 查看 DataFrame 的数据类型df2.dtypes
animal objectage float64visits int64priority objectdtype: object
# 23. 预览 DataFrame 的前 5 行数据df2.head() # 默认前5
| animal | age | visits | priority |
a | cat | 2.5 | 1 | yes |
b | cat | 3.0 | 3 | yes |
c | snake | 0.5 | 2 | no |
d | dog | NaN | 3 | yes |
e | dog | 5.0 | 2 | no |
# 24. 查看 DataFrame 的后 3 行数据df2.tail(3)
| animal | age | visits | priority |
h | cat | NaN | 1 | yes |
i | dog | 7.0 | 2 | no |
j | dog | 3.0 | 1 | no |
# 25.查看索引df2.index
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
# 26.查看列名df2.columns
Index(['animal', 'age', 'visits', 'priority'], dtype='object')
# 27. 查看数值df2.values
array([['cat', 2.5, 1, 'yes'], ['cat', 3.0, 3, 'yes'], ['snake', 0.5, 2, 'no'], ['dog', nan, 3, 'yes'], ['dog', 5.0, 2, 'no'], ['cat', 2.0, 3, 'no'], ['snake', 4.5, 1, 'no'], ['cat', nan, 1, 'yes'], ['dog', 7.0, 2, 'no'], ['dog', 3.0, 1, 'no']], dtype=object)
# 28.查看统计数据df2.describe()
| age | visits |
count | 8.000000 | 10.000000 |
mean | 3.437500 | 1.900000 |
std | 2.007797 | 0.875595 |
min | 0.500000 | 1.000000 |
25% | 2.375000 | 1.000000 |
50% | 3.000000 | 2.000000 |
75% | 4.625000 | 2.750000 |
max | 7.000000 | 3.000000 |
# 29.转置操作df2.T
| a | b | c | d | e | f | g | h | i | j |
animal | cat | cat | snake | dog | dog | cat | snake | cat | dog | dog |
age | 2.5 | 3 | 0.5 | NaN | 5 | 2 | 4.5 | NaN | 7 | 3 |
visits | 1 | 3 | 2 | 3 | 2 | 3 | 1 | 1 | 2 | 1 |
priority | yes | yes | no | yes | no | no | no | yes | no | no |
# 30.按列排序df2.sort_values(by='age', ascending=True) # 默认升序
| animal | age | visits | priority |
c | snake | 0.5 | 2 | no |
f | cat | 2.0 | 3 | no |
a | cat | 2.5 | 1 | yes |
b | cat | 3.0 | 3 | yes |
j | dog | 3.0 | 1 | no |
g | snake | 4.5 | 1 | no |
e | dog | 5.0 | 2 | no |
i | dog | 7.0 | 2 | no |
d | dog | NaN | 3 | yes |
h | cat | NaN | 1 | yes |
# 31.对DataFrame数据切片df2[1:3]
| animal | age | visits | priority |
b | cat | 3.0 | 3 | yes |
c | snake | 0.5 | 2 | no |
df2['age'] # 32.单列查询
a 2.5b 3.0c 0.5d NaNe 5.0f 2.0g 4.5h NaNi 7.0j 3.0Name: age, dtype: float64
df2.age
a 2.5b 3.0c 0.5d NaNe 5.0f 2.0g 4.5h NaNi 7.0j 3.0Name: age, dtype: float64
# 33.多列查询df2[['age','animal']]
| age | animal |
a | 2.5 | cat |
b | 3.0 | cat |
c | 0.5 | snake |
d | NaN | dog |
e | 5.0 | dog |
f | 2.0 | cat |
g | 4.5 | snake |
h | NaN | cat |
i | 7.0 | dog |
j | 3.0 | dog |
# 34.通过位置查询df2.iloc[1:3]
| animal | age | visits | priority |
b | cat | 3.0 | 3 | yes |
c | snake | 0.5 | 2 | no |
df2.loc['c','age']
0.5
df2.loc[:'f',['age','animal']]
| age | animal |
a | 2.5 | cat |
b | 3.0 | cat |
c | 0.5 | snake |
d | NaN | dog |
e | 5.0 | dog |
f | 2.0 | cat |
# 35.副本拷贝df3 = df2.copy()df3
| animal | age | visits | priority |
a | cat | 2.5 | 1 | yes |
b | cat | 3.0 | 3 | yes |
c | snake | 0.5 | 2 | no |
d | dog | NaN | 3 | yes |
e | dog | 5.0 | 2 | no |
f | cat | 2.0 | 3 | no |
g | snake | 4.5 | 1 | no |
h | cat | NaN | 1 | yes |
i | dog | 7.0 | 2 | no |
j | dog | 3.0 | 1 | no |
# 36.判断是否元素为空df3.isnull() # 空返回True
| animal | age | visits | priority |
a | False | False | False | False |
b | False | False | False | False |
c | False | False | False | False |
d | False | True | False | False |
e | False | False | False | False |
f | False | False | False | False |
g | False | False | False | False |
h | False | True | False | False |
i | False | False | False | False |
j | False | False | False | False |
# 37.添加列num= pd.Series([1,2,3,4,5,6,7,8,9,10], index=df3.index)df3['No.'] = numdf3
| animal | age | visits | priority | No. |
a | cat | 2.5 | 1 | yes | 1 |
b | cat | 3.0 | 3 | yes | 2 |
c | snake | 0.5 | 2 | no | 3 |
d | dog | NaN | 3 | yes | 4 |
e | dog | 5.0 | 2 | no | 5 |
f | cat | 2.0 | 3 | no | 6 |
g | snake | 4.5 | 1 | no | 7 |
h | cat | NaN | 1 | yes | 8 |
i | dog | 7.0 | 2 | no | 9 |
j | dog | 3.0 | 1 | no | 10 |
# 39.通过DataFrame的标签对数据进行修改df3.loc['f','age'] = 1.5df3
| animal | age | visits | priority | No. |
a | cat | 2.5 | 1 | yes | 1 |
b | cat | 3.0 | 3 | yes | 2 |
c | snake | 0.5 | 2 | no | 3 |
d | dog | NaN | 3 | yes | 4 |
e | dog | 5.0 | 2 | no | 5 |
f | cat | 1.5 | 3 | no | 6 |
g | snake | 4.5 | 1 | no | 7 |
h | cat | NaN | 1 | yes | 8 |
i | dog | 7.0 | 2 | no | 9 |
j | dog | 3.0 | 1 | no | 10 |
# 40. Column-wise mean.
# numeric_only=True limits the computation to the numeric columns
# (age, visits, No.); without it, recent pandas versions raise a TypeError
# on the string columns instead of silently skipping them.
df3.mean(numeric_only=True)
age 3.375visits 1.900No. 5.500dtype: float64
# 41.对任意列求和df3['visits'].sum()
19
# 42 将字符串转换为小写字母string = pd.Series([ 'A','B','asDS',np.nan])print(string)string.str.lower()
0 A1 B2 asDS3 NaNdtype: object0 a1 b2 asds3 NaNdtype: object
# 43.转化为大写string.str.upper()
0 A1 B2 ASDS3 NaNdtype: object
# 44.对缺失值进行填充df4 = df3.copy()print(df4)df4.fillna(value=3)
animal age visits priority No.a cat 2.5 1 yes 1b cat 3.0 3 yes 2c snake 0.5 2 no 3d dog NaN 3 yes 4e dog 5.0 2 no 5f cat 1.5 3 no 6g snake 4.5 1 no 7h cat NaN 1 yes 8i dog 7.0 2 no 9j dog 3.0 1 no 10
| animal | age | visits | priority | No. |
a | cat | 2.5 | 1 | yes | 1 |
b | cat | 3.0 | 3 | yes | 2 |
c | snake | 0.5 | 2 | no | 3 |
d | dog | 3.0 | 3 | yes | 4 |
e | dog | 5.0 | 2 | no | 5 |
f | cat | 1.5 | 3 | no | 6 |
g | snake | 4.5 | 1 | no | 7 |
h | cat | 3.0 | 1 | yes | 8 |
i | dog | 7.0 | 2 | no | 9 |
j | dog | 3.0 | 1 | no | 10 |
# 45.删除存在缺失值的行df5 = df3.copy()print(df5)df5.dropna(how='any') # 任何存在nan行的都将被删掉
animal age visits priority No.a cat 2.5 1 yes 1b cat 3.0 3 yes 2c snake 0.5 2 no 3d dog NaN 3 yes 4e dog 5.0 2 no 5f cat 1.5 3 no 6g snake 4.5 1 no 7h cat NaN 1 yes 8i dog 7.0 2 no 9j dog 3.0 1 no 10
| animal | age | visits | priority | No. |
a | cat | 2.5 | 1 | yes | 1 |
b | cat | 3.0 | 3 | yes | 2 |
c | snake | 0.5 | 2 | no | 3 |
e | dog | 5.0 | 2 | no | 5 |
f | cat | 1.5 | 3 | no | 6 |
g | snake | 4.5 | 1 | no | 7 |
i | dog | 7.0 | 2 | no | 9 |
j | dog | 3.0 | 1 | no | 10 |
# 46. Align two frames on a shared column.
l = pd.DataFrame({'key': ['foo1', 'foo2'], 'one': [1, 2]})
r = pd.DataFrame({'key': ['foo2', 'foo1'], 'two': [4, 5]})
print(l)
print(r)
# Inner join on 'key'. Both frames contain foo1 and foo2, so the result has
# two rows (in the key order of the left frame), each pairing 'one' with the
# 'two' value that carries the same key.
pd.merge(l, r, on='key')
key one0 foo1 11 foo2 2 key two0 foo2 41 foo1 5
| key | one | two |
0 | foo1 | 1 | 5 |
1 | foo2 | 2 | 4 |
# 51. Build a Series indexed by every day of 2019 whose values are random
# numbers drawn from [0, 1).
dti = pd.date_range(start='20190101', end='20191231', freq="D")
s = pd.Series(np.random.rand(len(dti)), index=dti)
s
2019-01-01 0.0376382019-01-02 0.1468352019-01-03 0.6300112019-01-04 0.2253522019-01-05 0.5494222019-01-06 0.1361732019-01-07 0.9760752019-01-08 0.5818662019-01-09 0.6674772019-01-10 0.6164542019-01-11 0.6250502019-01-12 0.1314152019-01-13 0.5588832019-01-14 0.7492712019-01-15 0.6764462019-01-16 0.0841042019-01-17 0.0730562019-01-18 0.2321862019-01-19 0.2133572019-01-20 0.4576642019-01-21 0.5383372019-01-22 0.7284272019-01-23 0.8993022019-01-24 0.8506092019-01-25 0.7165022019-01-26 0.3193392019-01-27 0.5774552019-01-28 0.1269912019-01-29 0.5274392019-01-30 0.551891 ... 2019-12-02 0.2465042019-12-03 0.1175492019-12-04 0.2239772019-12-05 0.4071092019-12-06 0.9538202019-12-07 0.5839622019-12-08 0.0147352019-12-09 0.0091902019-12-10 0.4963302019-12-11 0.1919812019-12-12 0.0029352019-12-13 0.5301972019-12-14 0.3288302019-12-15 0.0816382019-12-16 0.9222512019-12-17 0.3323892019-12-18 0.0765672019-12-19 0.9062162019-12-20 0.4813112019-12-21 0.0804052019-12-22 0.2915322019-12-23 0.9334242019-12-24 0.4397712019-12-25 0.7385652019-12-26 0.2154012019-12-27 0.8496872019-12-28 0.8610602019-12-29 0.8310742019-12-30 0.9443072019-12-31 0.245717Freq: D, Length: 365, dtype: float64
# 52.统计s中每一个周三对应值的和s[s.index.weekday == 2].sum() # 周一从0开始
28.54901665149845
# 53.统计s中每个月的平均值s.resample('M').mean()
2019-01-31 0.4716942019-02-28 0.5861592019-03-31 0.5152262019-04-30 0.5301702019-05-31 0.4811622019-06-30 0.5455772019-07-31 0.5476922019-08-31 0.4951582019-09-30 0.5471092019-10-31 0.5447062019-11-30 0.4563122019-12-31 0.434590Freq: M, dtype: float64
# 66.条件查找data = { 'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'], 'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3], 'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1], 'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']df = pd.DataFrame(data, index=labels)df[df['age']>3]
| animal | age | visits | priority |
e | dog | 5.0 | 2 | no |
g | snake | 4.5 | 1 | no |
i | dog | 7.0 | 2 | no |
# 68.多重条件查询
df = pd.DataFrame(data, index=labels)df[(df['animal']=='cat')&(df['age']<3)]
| animal | age | visits | priority |
a | cat | 2.5 | 1 | yes |
f | cat | 2.0 | 3 | no |
# 64.按关键字查询df3[df3['animal'].isin(['cat','dog'])]
| animal | age | visits | priority | No. |
a | cat | 2.5 | 1 | yes | 1 |
b | cat | 3.0 | 3 | yes | 2 |
d | dog | NaN | 3 | yes | 4 |
e | dog | 5.0 | 2 | no | 5 |
f | cat | 1.5 | 3 | no | 6 |
h | cat | NaN | 1 | yes | 8 |
i | dog | 7.0 | 2 | no | 9 |
j | dog | 3.0 | 1 | no | 10 |
# 70.按标签名及列名查询df.loc[df2.index[[3,4,8]],['animal','age']]
| animal | age |
d | dog | NaN |
e | dog | 5.0 |
i | dog | 7.0 |
# 71多条件排序df.sort_values(by=['age','visits'], ascending=[False, True]) # age降序,
| animal | age | visits | priority |
i | dog | 7.0 | 2 | no |
e | dog | 5.0 | 2 | no |
g | snake | 4.5 | 1 | no |
b | cat | 3.0 | 3 | yes |
j | dog | 3.0 | 1 | no |
a | cat | 2.5 | 1 | yes |
f | cat | 2.0 | 3 | no |
c | snake | 0.5 | 2 | no |
d | dog | NaN | 3 | yes |
h | cat | NaN | 1 | yes |
# 73. Grouped sum per animal.
# numeric_only=True keeps the result to the numeric columns (age, visits,
# No.); otherwise recent pandas versions also "sum" the string column
# 'priority' by concatenating its values.
df4.groupby(by='animal').sum(numeric_only=True)
| age | visits | No. |
animal | | | |
cat | 7.0 | 8 | 17 |
dog | 15.0 | 8 | 28 |
snake | 5.0 | 3 | 10 |
# 数据清洗
# 88. 缺失值拟合
# 在 `FlightNumber` 中有数值缺失,其中数值为按 10 增长,补充相应的缺省值使得数据完整,并让数据为 `int` 类型。
# Raw flight data used by the cleaning exercises below. The values are
# deliberately messy: mixed-case city names, missing flight numbers,
# variable-length delay lists and airline names wrapped in stray characters.
df = pd.DataFrame({
    'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm',
                'Budapest_PaRis', 'Brussels_londOn'],
    'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
    'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
    # The second entry must include the literal "<Air France>"; it is easily
    # lost when the text passes through an HTML renderer, and without it the
    # later name extraction yields an empty string for that row.
    'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )',
                '12. Air France', '"Swiss Air"'],
})
df
| From_To | FlightNumber | RecentDelays | Airline |
0 | LoNDon_paris | 10045.0 | [23, 47] | KLM(!) |
1 | MAdrid_miLAN | NaN | [] | <Air France> (12) |
2 | londON_StockhOlm | 10065.0 | [24, 43, 87] | (British Airways. ) |
3 | Budapest_PaRis | NaN | [13] | 12. Air France |
4 | Brussels_londOn | 10085.0 | [67, 32] | "Swiss Air" |
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
df
| From_To | FlightNumber | RecentDelays | Airline |
0 | LoNDon_paris | 10045 | [23, 47] | KLM(!) |
1 | MAdrid_miLAN | 10055 | [] | <Air France> (12) |
2 | londON_StockhOlm | 10065 | [24, 43, 87] | (British Airways. ) |
3 | Budapest_PaRis | 10075 | [13] | 12. Air France |
4 | Brussels_londOn | 10085 | [67, 32] | "Swiss Air" |
# 89. 数据列拆分# 其中From_to应该为两独立的两列From和To,将From_to依照_拆分为独立两列建立为一个新表。temp = df.From_To.str.split('_',expand=True)temp.columns = ['From','To']temp
| From | To |
0 | LoNDon | paris |
1 | MAdrid | miLAN |
2 | londON | StockhOlm |
3 | Budapest | PaRis |
4 | Brussels | londOn |
# 90. 字符标准化# 其中注意到地点的名字都不规范(如:londON应该为London)需要对数据进行标准化处理。temp['From'] = temp['From'].str.capitalize()temp['To'] = temp['To'].str.capitalize()
temp
| From | To |
0 | London | Paris |
1 | Madrid | Milan |
2 | London | Stockholm |
3 | Budapest | Paris |
4 | Brussels | London |
# 91. 删除坏数据加入整理好的数据# 将最开始的 From_to 列删除,加入整理好的 From 和 to 列。df = df.drop('From_To', axis=1)df = df.join(temp)df
| FlightNumber | RecentDelays | Airline | From | To |
0 | 10045 | [23, 47] | KLM(!) | London | Paris |
1 | 10055 | [] | <Air France> (12) | Madrid | Milan |
2 | 10065 | [24, 43, 87] | (British Airways. ) | London | Stockholm |
3 | 10075 | [13] | 12. Air France | Budapest | Paris |
4 | 10085 | [67, 32] | "Swiss Air" | Brussels | London |
# 92. Strip extraneous characters.
# Many values in the Airline column carry extra symbols that would distort
# later analysis, so keep only the first run of letters and whitespace and
# trim the surrounding blanks.
# The pattern is a raw string so that \s reaches the regex engine unchanged
# instead of being treated as an (invalid) Python string escape.
df['Airline'] = df['Airline'].str.extract(r'([a-zA-Z\s]+)', expand=False).str.strip()
df
| FlightNumber | RecentDelays | Airline | From | To |
0 | 10045 | [23, 47] | KLM | London | Paris |
1 | 10055 | [] | Air France | Madrid | Milan |
2 | 10065 | [24, 43, 87] | British Airways | London | Stockholm |
3 | 10075 | [13] | Air France | Budapest | Paris |
4 | 10085 | [67, 32] | Swiss Air | Brussels | London |
# 93. Normalise the format.
# RecentDelays stores a list per row, and the lists differ in length, which
# is awkward for later analysis. Spread each list across columns by
# position; positions a row does not have are filled with NaN.
# Building one DataFrame from the list of lists does this in a single step
# and avoids constructing a separate Series for every row.
delays = pd.DataFrame(df['RecentDelays'].tolist(), index=df.index)
delays.columns = ['delay_{}'.format(n) for n in range(1, len(delays.columns) + 1)]
df = df.drop('RecentDelays', axis=1).join(delays)
df
| FlightNumber | Airline | From | To | delay_1 | delay_2 | delay_3 |
0 | 10045 | KLM | London | Paris | 23.0 | 47.0 | NaN |
1 | 10055 | Air France | Madrid | Milan | NaN | NaN | NaN |
2 | 10065 | British Airways | London | Stockholm | 24.0 | 43.0 | 87.0 |
3 | 10075 | Air France | Budapest | Paris | 13.0 | NaN | NaN |
4 | 10085 | Swiss Air | Brussels | London | 67.0 | 32.0 | NaN |
# 数据预处理# 94. 信息区间划分df=pd.DataFrame({ 'name':['Alice','Bob','Candy','Dany','Ella','Frank','Grace','Jenny'],'grades':[58,83,79,65,93,45,61,88]})
def choice(x):
    """Map a grade to a binary flag.

    Args:
        x: The grade to classify; must be comparable with an integer.

    Returns:
        1 if x is strictly greater than 60, otherwise 0 (a grade of exactly
        60 therefore maps to 0).
    """
    return 1 if x > 60 else 0
# Replace every grade with its binary flag. Series.apply keeps the frame's
# own index, whereas wrapping map() in a new Series creates a fresh 0..n-1
# index that is then aligned on assignment and would produce NaN for any
# frame with a different index.
df['grades'] = df['grades'].apply(choice)
df
| name | grades |
0 | Alice | 0 |
1 | Bob | 1 |
2 | Candy | 1 |
3 | Dany | 1 |
4 | Ella | 1 |
5 | Frank | 0 |
6 | Grace | 1 |
7 | Jenny | 1 |
# 95. Remove consecutive duplicates.
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 4, 4, 57, 8]})
# Keep a row only when its value differs from the row directly above it.
# Repeats that are not adjacent are kept: the 4 at index 5 survives even
# though 4 already appears at index 3; only the 4 at index 6 is dropped.
df.loc[df['A'].shift() != df['A']]
| A |
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
4 | 5 |
5 | 4 |
7 | 57 |
8 | 8 |
# 96. 数据归一化
# 有时候,DataFrame 中不同列之间的数据差距太大,需要对其进行归一化处理。
# 其中,Max-Min 归一化是简单而常见的一种方式,公式如下:
# $Y = \dfrac{X - X_{min}}{X_{max} - X_{min}}$
def normalization(df):
    """Apply min-max normalization to every column of a DataFrame.

    Each column is rescaled as (value - column min) / (column max - column
    min), so its smallest value becomes 0 and its largest becomes 1.

    Args:
        df: DataFrame with numeric columns. It is not modified.

    Returns:
        A new DataFrame of the same shape holding the rescaled values. A
        column whose max equals its min has a zero denominator, so its
        entries come out as NaN (0 / 0).
    """
    column_min = df.min()
    numerator = df.sub(column_min)
    denominator = df.max().sub(column_min)
    return numerator.div(denominator)
df = pd.DataFrame(np.random.random(size=(5,3)))print(df)normalization(df)
0 1 20 0.920675 0.181496 0.4081791 0.016837 0.740842 0.2396252 0.577404 0.503003 0.0774013 0.502584 0.262550 0.0008484 0.817712 0.774605 0.073925
| 0 | 1 | 2 |
0 | 1.000000 | 0.000000 | 1.000000 |
1 | 0.000000 | 0.943074 | 0.586199 |
2 | 0.620207 | 0.542072 | 0.187938 |
3 | 0.537427 | 0.136659 | 0.000000 |
4 | 0.886083 | 1.000000 | 0.179404 |
# 97. Series 可视化%matplotlib inline
ts = pd.Series(np.random.randn(100), index=pd.date_range('today', periods=100))ts = ts.cumsum()print(ts)ts.plot()
2019-07-16 11:14:32.969237 -0.1605272019-07-17 11:14:32.969237 -0.4135022019-07-18 11:14:32.969237 0.4949392019-07-19 11:14:32.969237 -0.1783432019-07-20 11:14:32.969237 -1.2798422019-07-21 11:14:32.969237 -0.5389812019-07-22 11:14:32.969237 -1.9527032019-07-23 11:14:32.969237 -2.3508312019-07-24 11:14:32.969237 -2.6524192019-07-25 11:14:32.969237 -4.9768562019-07-26 11:14:32.969237 -5.5969932019-07-27 11:14:32.969237 -4.8806972019-07-28 11:14:32.969237 -5.9182252019-07-29 11:14:32.969237 -4.7202132019-07-30 11:14:32.969237 -4.0562082019-07-31 11:14:32.969237 -3.5266402019-08-01 11:14:32.969237 -2.2955202019-08-02 11:14:32.969237 -0.3818502019-08-03 11:14:32.969237 -0.0779562019-08-04 11:14:32.969237 0.4418312019-08-05 11:14:32.969237 -1.6246912019-08-06 11:14:32.969237 -1.0843162019-08-07 11:14:32.969237 -2.1341242019-08-08 11:14:32.969237 -1.4773982019-08-09 11:14:32.969237 -2.2991942019-08-10 11:14:32.969237 -2.5016632019-08-11 11:14:32.969237 -3.1907932019-08-12 11:14:32.969237 -4.2370492019-08-13 11:14:32.969237 -4.4772302019-08-14 11:14:32.969237 -4.171017 ... 
2019-09-24 11:14:32.969237 -13.5697302019-09-25 11:14:32.969237 -14.6271882019-09-26 11:14:32.969237 -15.4616382019-09-27 11:14:32.969237 -16.1215602019-09-28 11:14:32.969237 -16.5695112019-09-29 11:14:32.969237 -17.9008422019-09-30 11:14:32.969237 -19.1940012019-10-01 11:14:32.969237 -17.9792932019-10-02 11:14:32.969237 -18.6459032019-10-03 11:14:32.969237 -19.2413672019-10-04 11:14:32.969237 -19.2113652019-10-05 11:14:32.969237 -18.0884192019-10-06 11:14:32.969237 -17.7679762019-10-07 11:14:32.969237 -16.2738832019-10-08 11:14:32.969237 -16.7518122019-10-09 11:14:32.969237 -16.4604682019-10-10 11:14:32.969237 -15.5345142019-10-11 11:14:32.969237 -16.0292532019-10-12 11:14:32.969237 -16.6299952019-10-13 11:14:32.969237 -17.1817342019-10-14 11:14:32.969237 -16.1395462019-10-15 11:14:32.969237 -16.2494242019-10-16 11:14:32.969237 -14.7977192019-10-17 11:14:32.969237 -17.1985462019-10-18 11:14:32.969237 -18.1938872019-10-19 11:14:32.969237 -18.1758412019-10-20 11:14:32.969237 -18.0390032019-10-21 11:14:32.969237 -17.8848382019-10-22 11:14:32.969237 -18.9857602019-10-23 11:14:32.969237 -18.987684Freq: D, Length: 100, dtype: float64
# 98. DataFrame 折线图df = pd.DataFrame(np.random.randn(100, 4), index=ts.index, columns=['A','B','C','D'])df =df.cumsum()print(df)df.plot()
A B C D2019-07-16 11:14:32.969237 -2.311551 -2.601142 0.852766 0.7668992019-07-17 11:14:32.969237 -0.879667 -4.293468 -0.039314 0.8228822019-07-18 11:14:32.969237 -1.249910 -5.562160 -0.456214 0.7208132019-07-19 11:14:32.969237 -0.567523 -5.869549 -1.250540 1.2048542019-07-20 11:14:32.969237 0.000393 -3.939871 -1.824283 1.3779182019-07-21 11:14:32.969237 -1.957763 -4.426390 -1.644319 0.4119902019-07-22 11:14:32.969237 -1.863936 -5.952407 -0.678510 0.8828742019-07-23 11:14:32.969237 -2.047160 -6.771213 1.407736 1.7570212019-07-24 11:14:32.969237 -2.230326 -6.520421 3.122783 2.9760792019-07-25 11:14:32.969237 -3.833992 -6.785455 2.087702 4.0750222019-07-26 11:14:32.969237 -4.315307 -8.567182 2.688330 5.3659912019-07-27 11:14:32.969237 -5.248594 -8.344775 3.382635 4.2149692019-07-28 11:14:32.969237 -5.054369 -7.385112 3.765415 5.0666372019-07-29 11:14:32.969237 -2.931733 -7.085015 3.746368 5.7564382019-07-30 11:14:32.969237 -4.190044 -7.517056 3.133894 8.2179032019-07-31 11:14:32.969237 -3.139043 -8.779127 2.402586 7.8600252019-08-01 11:14:32.969237 -1.870986 -8.921735 2.442751 7.9568242019-08-02 11:14:32.969237 -1.947051 -9.726026 2.805189 8.7300092019-08-03 11:14:32.969237 -2.468689 -7.685965 2.295436 6.7956882019-08-04 11:14:32.969237 -2.138392 -7.481845 3.769528 7.0188162019-08-05 11:14:32.969237 -1.521903 -5.906005 2.340666 7.2808662019-08-06 11:14:32.969237 -0.851497 -5.947501 4.279168 6.2295892019-08-07 11:14:32.969237 -0.745985 -6.307143 5.847261 5.6307052019-08-08 11:14:32.969237 -0.459598 -5.138792 4.995194 5.6479152019-08-09 11:14:32.969237 -0.324185 -5.226607 3.466786 4.2925912019-08-10 11:14:32.969237 -0.352415 -5.121374 3.401821 4.9661652019-08-11 11:14:32.969237 1.123371 -4.678556 2.997400 4.7304022019-08-12 11:14:32.969237 1.621475 -4.918931 1.978229 5.8918172019-08-13 11:14:32.969237 0.528799 -4.923886 1.741921 4.0914292019-08-14 11:14:32.969237 0.234260 -6.577139 3.515839 3.965522... ... ... ... 
...2019-09-24 11:14:32.969237 2.098830 11.540368 -2.760031 2.0170742019-09-25 11:14:32.969237 1.917497 11.425361 -2.360769 1.5402592019-09-26 11:14:32.969237 1.586440 11.089945 -2.934906 2.0169882019-09-27 11:14:32.969237 2.426352 11.000135 -4.160570 1.6784622019-09-28 11:14:32.969237 2.590117 11.409677 -5.102951 3.1237962019-09-29 11:14:32.969237 2.586017 11.673688 -5.936028 2.1597312019-09-30 11:14:32.969237 5.012078 12.535448 -6.913949 4.0820582019-10-01 11:14:32.969237 3.529943 14.612272 -6.541449 3.1304292019-10-02 11:14:32.969237 3.376133 12.740237 -7.041879 3.0585732019-10-03 11:14:32.969237 3.536676 13.233300 -6.775922 3.5624602019-10-04 11:14:32.969237 5.075667 13.630937 -6.409229 3.4046472019-10-05 11:14:32.969237 4.633807 14.011680 -7.359063 2.5550632019-10-06 11:14:32.969237 4.108268 14.233577 -8.319235 1.7822572019-10-07 11:14:32.969237 5.389960 15.049002 -7.592306 3.0649962019-10-08 11:14:32.969237 4.904890 15.129739 -7.845749 2.1970242019-10-09 11:14:32.969237 2.894357 14.053121 -7.560088 2.1273222019-10-10 11:14:32.969237 2.432563 13.678098 -7.010267 2.5360352019-10-11 11:14:32.969237 1.493160 13.263020 -7.262265 2.9546922019-10-12 11:14:32.969237 2.477873 14.443603 -7.815188 2.4203562019-10-13 11:14:32.969237 1.914146 14.476938 -6.850849 2.9853172019-10-14 11:14:32.969237 1.944343 13.532021 -7.611172 4.7549202019-10-15 11:14:32.969237 2.379594 13.908116 -8.503684 5.2173892019-10-16 11:14:32.969237 1.479926 13.646017 -7.861792 4.7698452019-10-17 11:14:32.969237 3.376088 12.470308 -7.902426 4.7357792019-10-18 11:14:32.969237 3.847433 12.177020 -6.719579 3.1234752019-10-19 11:14:32.969237 3.904511 12.261467 -6.016796 3.4193902019-10-20 11:14:32.969237 3.188237 14.305071 -6.137896 2.9058132019-10-21 11:14:32.969237 4.006034 13.981431 -6.034235 2.4833232019-10-22 11:14:32.969237 4.187015 14.311562 -6.466325 0.5316752019-10-23 11:14:32.969237 4.928834 14.064165 -6.435447 -0.506871[100 rows x 4 columns]
# 99. DataFrame combined chart (not a scatter plot): monthly revenue is
# drawn as yellow bars, and advertising is drawn as a line on a secondary
# y-axis that shares the same axes object.
df = pd.DataFrame({
    "revenue": [57, 68, 63, 71, 72, 90, 80, 62, 59, 51, 47, 52],
    "advertising": [2.1, 1.9, 2.7, 3.0, 3.6, 3.2, 2.7, 2.4, 1.8, 1.6, 1.3, 1.9],
    "month": range(12),
})
ax = df.plot.bar('month', 'revenue', color='yellow')
df.plot('month', 'advertising', secondary_y=True, ax=ax)
转载地址:http://xzhgn.baihongyu.com/