numpy & pandas笔记

numpy

直接看例子和注释吧

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import numpy as np

# Build a 2-D array (matrix) from nested lists and print its basic attributes.
array = np.array([[1,2,3],[4,5,6]]) # create a matrix
print('number of dim:',array.ndim) # number of dimensions
print('shape:',array.shape) # shape of the matrix
print('size:',array.size) # total number of elements

'''
number of dim: 2
shape: (2, 3)
size: 6
'''


# Specify the element type explicitly when creating an array.
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24 (it now
# raises AttributeError); the builtin `int` selects the platform default
# integer type instead.
a = np.array([2, 3, 4], dtype=int)
print(a.dtype)  # platform dependent (the original author saw int32)
a = np.array([2, 3, 4], dtype=np.int64)  # float, np.float32, np.float64, ... also work here
print(a.dtype)  # int64

## Create a multi-dimensional matrix (3 rows x 4 columns) from nested lists
a = np.array([[1,2,3,5],[2,4,6,7],[3,5,6,7]])
print(a)

'''
[[1 2 3 5]
[2 4 6 7]
[3 5 6 7]]
'''

## Generate arrays with helper functions
a = np.zeros((3, 4))  # the shape is passed as a tuple, hence the inner parentheses
print(a)
'''
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]
'''
b = np.empty((2, 3))  # "empty" array; the original author observed values close to 0
print(b)
'''
[[6.23042070e-307 8.45590538e-307 7.56593017e-307]
[1.11261027e-306 1.69120009e-306 3.56175432e-317]]
'''
c = np.ones((4, 5), dtype=np.float64)  # the dtype can be given at the same time
print(c)
'''
[[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]]
'''
d = np.arange(12)  # works like the builtin range()
print(d)
'''
[ 0 1 2 3 4 5 6 7 8 9 10 11]
'''
d = d.reshape((3, 4))  # change the shape of the array
print(d)
'''
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
'''
# Fixed typo: the function is np.linspace (the original called the
# non-existent np.linspance). The first two arguments are the endpoints of a
# closed interval; the third is the number of samples to generate.
e = np.linspace(1, 3, 21).reshape((3, 7))
print(e)
'''
[[1. 1.1 1.2 1.3 1.4 1.5 1.6]
[1.7 1.8 1.9 2. 2.1 2.2 2.3]
[2.4 2.5 2.6 2.7 2.8 2.9 3. ]]
'''

## Array arithmetic
a = np.array([10,20,30,40])
b = np.arange(4)
print(a,b)
'''
[10 20 30 40] [0 1 2 3]
'''
c = a - b # element-wise subtraction
print(c)
'''
[10 19 28 37]
'''
c = b ** 2 # square every element
print(c)
'''
[0 1 4 9]
'''
c = 10 * np.sin(a) # sine of every element of a; cos and tan are used the same way
print(c)
'''
[-5.44021111 9.12945251 -9.88031624 7.4511316 ]
'''
c = b.reshape((2,2))
print(c < 3) # returns a boolean matrix with True where the condition holds
'''
[[ True True]
[ True False]]
'''
b = b.T # transpose (b is 1-D here, so its shape does not change)
print(np.dot(a,b)) # dot product of a and b
# equivalent to
print(a.dot(b))
'''
200
'''
print(a*b.T) # element-wise multiplication
'''
[ 0 20 60 120]
'''
d = np.random.random((2,4)) # generate a 2x4 matrix of random numbers
print(d)
'''
[[0.3438966 0.94945051 0.93425265 0.47868663]
[0.67721576 0.68745607 0.91757091 0.33878544]]
'''
print(np.sum(d)) # sum of all elements; np.min and np.max are called the same way
'''
5.32731458296
'''
# NOTE(review): the sample output below does not match the matrix printed
# above, so it presumably comes from a different random draw.
print(np.max(d,axis=0)) # axis selects the dimension to reduce over: 0 works per column, 1 per row
'''
[0.69313537 0.70135578 0.49353616 0.84148005]
'''
print(np.argmin(d,axis=1)) # index of the minimum value in each row
'''
[0 3]
'''

# Statistics and element-wise helpers on a 3x4 matrix holding the integers 2..13.
d = np.arange(2,14).reshape((3,4))
print(np.mean(d)) # mean of all elements
'''
7.5
'''
print(np.median(d)) # median of all elements
'''
7.5
'''
print(np.cumsum(d)) # cumulative sum over the flattened array
'''
[ 2 5 9 14 20 27 35 44 54 65 77 90]
'''
print(np.diff(d)) # differences between neighbouring elements within each row
'''
[[1 1 1]
[1 1 1]
[1 1 1]]
'''
print(np.nonzero(d)) # coordinates of the non-zero elements: the first array holds row indices, the second column indices
'''
(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64), array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], dtype=int64))
'''
print(np.sort(d)) # sort
'''
[[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
'''
print(np.clip(d,2.1,9.6)) # values below 2.1 become 2.1, values above 9.6 become 9.6, the rest keep their value
'''
[[2.1 3. 4. 5. ]
[6. 7. 8. 9. ]
[9.6 9.6 9.6 9.6]]
'''

## Indexing and iteration
A = np.arange(3, 15)
print(A[3])  # indices start at 0
'''
6
'''
A = A.reshape((3, 4))
print(A[2])  # row with index 2 (the third row)
'''
[11 12 13 14]
'''
print(A[0][1])  # row 0, column 1
# equivalent to
print(A[0, 1])
'''
4
'''
print(A[2, :])  # a colon selects everything along that axis, like list slicing
'''
[11 12 13 14]
'''
print(A[0:2, :-1])  # rows 0 and 1, every column except the last
'''
[[3 4 5]
[7 8 9]]
'''
# The loop bodies below must be indented; the original had lost its
# indentation, which is a syntax error.
for row in A:  # iterating a 2-D array yields its rows
    print(row)
'''
[3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]
'''
for col in A.T:  # iterate over columns by iterating the transpose
    print(col)
'''
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]
[ 6 10 14]
'''
print(A.flatten())  # collapse into a single row
'''
[ 3 4 5 6 7 8 9 10 11 12 13 14]
'''
for item in A.flat:  # iterate element by element
    print(item)
'''
3
4
5
6
7
8
9
10
11
12
13
14
'''


## Merging arrays
A = np.array([1,1,1])
B = np.array([2,2,2])
print(np.vstack((A,B))) # stack vertically (one on top of the other)
'''
[[1 1 1]
[2 2 2]]
'''
print(np.hstack((A,B))) # stack horizontally (side by side)
'''
[1 1 1 2 2 2]
'''
print(A.T) # transposing a 1-D array does not change its shape
'''
[1 1 1]
'''
print(A[:,np.newaxis]) # add a new axis so the array becomes 3x1
'''
[[1]
[1]
[1]]
'''
# Without an axis argument the result is flat, as the sample output shows.
print(np.append([1,2,4],[[4,5,6],[7,8,9]]))
'''
[1 2 4 4 5 6 7 8 9]
'''
print(np.append([[1,2,4]],[[4,5,6],[7,8,9]],axis=0)) # mind the bracket nesting depth
'''
[[1 2 4]
[4 5 6]
[7 8 9]]
'''
print(np.array([1,2,3]).shape,np.array([[1,2,3]]).shape) # different nesting depth gives a different shape; many functions require matching shapes
'''
(3,) (1, 3)
'''
a = np.array([[1,2],[3,4]])
b = np.array([[5,6]]) # mind the bracket nesting depth
print(np.concatenate((a,b),axis = 0)) # concatenate vertically; the inputs must have the same number of dimensions
'''
[[1 2]
[3 4]
[5 6]]
'''
print(np.concatenate((a,b.T),axis = 1)) # concatenate horizontally
'''
[[1 2 5]
[3 4 6]]
'''


## Splitting arrays
A = np.arange(12).reshape((3,4))
print(np.split(A,2,axis=1)) # split A into 2 parts along axis 1 (axis 0 is also possible)
'''
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
'''
# print(np.split(A,3,axis=1)) raises an error: 4 columns cannot be split into 3 equal parts
print(np.array_split(A,3,axis = 1)) # array_split allows parts of unequal size
'''
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2],
[ 6],
[10]]), array([[ 3],
[ 7],
[11]])]
'''
print(np.vsplit(A,3)) # split into 3 parts row-wise
'''
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
'''
print(np.hsplit(A,2)) # split into 2 parts column-wise
'''
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
'''

## copy
# Plain assignment only creates another name for the same array object.
a = np.arange(4)
b = a
c = a
d = b
a[0] = 11
print(a, b, c, d)  # a, b, c and d show the same contents
'''
[11  1  2  3] [11  1  2  3] [11  1  2  3] [11  1  2  3]
'''
print(a is b, b is c, c is d)  # all four names refer to one object
'''
True True True
'''
# copy() must be *called*; the original `b = a.copy` bound the method object
# itself, so printing b showed "<built-in method copy ...>" instead of data.
b = a.copy()
print(a, b)  # same values
print(a is b)  # but a different object
'''
[11  1  2  3] [11  1  2  3]
False
'''

关于axis [1]:

首先对numpy中axis取值进行说明:一维数组时axis=0,二维数组时axis=0,1,维数越高,则axis可取的值越大,数组n维时,axis=0,1,…,n-1。为了方便下面的理解,我们这样看待:在numpy中数组都有着[]标记,则axis=0对应着最外层的[],axis=1对应第二外层的[],以此类推,axis=n-1对应第n外层的[]。

下面开始从axis=0,axis=1这两个例子开始,深入透彻的理解numpy中axis的用法。
axis = 0表示对最外层[]里的最大单位块做块与块之间的运算,同时移除最外层[]:

1
2
3
4
# 1-D example: summing over axis 0 (shown in the text to return 6).
a= np.array([1,2,3])  
a.sum(axis = 0)

>>> 6

因为只有一层[],所以直接对这一层里的最大单位块1,2,3做运算;做完加法后本应是[6],但是移除最外层[]后,[]不存在了,所以返回的是6。

1
2
3
4
# 2-D example: summing over axis 0 adds the rows [1,2] and [3,4] element-wise.
a= np.array([[1,2],[3,4]]) 
a.sum(axis = 0)

>>> array([4, 6])

有两层[],最外层[]里的最大单位块分别为[1,2],[3,4],对这两个单位块做块与块之间的运算,[1,2]+[3,4] = [4, 6],做完加法后本应是[[4, 6]],但是移除最外层[]后,原来的两层[]变成一层[],所以返回结果为 [4, 6]。

1
2
3
4
# 3-D example: summing over axis 0 adds the two outermost 2x2 blocks element-wise.
# The array must be bound to `a`; the original discarded it, so the following
# a.sum(...) would have operated on a stale (or undefined) `a`.
a = np.array([[[1,2],[3,4]],[[11,12],[13,14]]])
a.sum(axis = 0)

>>> array([[12, 14], [16, 18]])

有三层[],最外层[]里的最大单位块分别为[[1,2],[3,4]],[[11,12],[13,14]],对这两个单位块做块与块之间的运算,[[1,2],[3,4]] + [[11,12],[13,14]] = [[12, 14], [16, 18]];做完加法后本应是[[[12, 14], [16, 18]]],但是移除最外层[]后,原来的三层[]变成两层[],所以返回结果为[[12, 14], [16, 18]];

axis= 1表示对第二外层[]里的最大单位块做块与块之间的运算,同时移除第二外层[]:

1
2
3
4
# 2-D example: summing over axis 1 adds the elements inside each row (1+2 and 3+4).
a= np.array([[1,2],[3,4]]) 
a.sum(axis = 1)

>>> array([3, 7])

有两层[],第二外层[]里的最大单位块有两组(因为有两个第二外层[]),第一组是1,2,第二组是3,4,分别对这两个单位块做块与块之间的运算,第一组结果为1+2=3,第二组结果为3+4=7

做完加法后本应是[[3],[7]],但是==移除第二外层[]==后,原来的两层[]变成一层[],所以返回结果为[3, 7]。

1
2
3
4
# 3-D example: summing over axis 1 adds the rows inside each outer block,
# i.e. [1,2]+[3,4] and [11,12]+[13,14].
# The array must be bound to `a`; the original discarded it, so the following
# a.sum(...) would have operated on a stale (or undefined) `a`.
a = np.array([[[1,2],[3,4]],[[11,12],[13,14]]])
a.sum(axis = 1)

>>> array([[ 4,  6], [24, 26]])

有三层[],第二外层[]里的最大单位块有两组(因为有两个第二外层[]),第一组是[1,2],[3,4],第二组是[11,12],[13,14],分别对这两个单位块做块与块之间的运算,第一组结果为[1,2]+[3,4] = [ 4, 6],第二组结果为[11,12]+[13,14] = [24, 26]
做完加法后本应是[[[ 4, 6]], [[24, 26]]],但是==移除第二外层[]==后,原来的三层[]变成两层[],所以返回结果为[[ 4, 6], [24, 26]]

axis = 2,3,4等更高的取值也如此分析

看懂了这些说明,相信你对axis已经有了深入的理解,以后再也不用怕高维数组关于axis的运算了!

pandas

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
import numpy as np
import pandas as pd

# Basic pandas objects: a Series, a date index, and a DataFrame built on that index.
s = pd.Series([1,3,5,np.nan,44,6]) # create a Series
print(s) # the row labels are printed next to the values
'''
0 1.0
1 3.0
2 5.0
3 NaN
4 44.0
5 6.0
dtype: float64
'''
dates = pd.date_range('20200926',periods = 6) # create a sequence of 6 dates
print(dates)
'''
DatetimeIndex(['2020-09-26', '2020-09-27', '2020-09-28', '2020-09-29',
'2020-09-30', '2020-10-01'],
dtype='datetime64[ns]', freq='D')
'''
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d']) # create a DataFrame whose row labels are dates and whose column labels are a b c d
print(df)
'''
a b c d
2020-09-26 -0.785197 1.307849 1.481041 -0.068181
2020-09-27 -0.612014 1.621742 -1.145252 0.382694
2020-09-28 -0.066207 -1.617779 -1.584659 1.078805
2020-09-29 -1.323643 0.289927 0.529782 -0.543563
2020-09-30 1.145636 -1.746915 1.754404 -2.203711
2020-10-01 -0.678938 -1.569018 -0.075303 -0.780225
'''
# Build a DataFrame from a dict of columns, then inspect and sort it.
df2 = pd.DataFrame({'A':1.,'B':pd.Timestamp('20200926'),'C':pd.Series(1,index=list(range(4)),dtype='float32'),'D':np.array([3]*4,dtype='int32'),'E':pd.Categorical(['test','train','test','train']),'F':'foo'}) # create a DataFrame from a dict
print(df2)
'''
A B C D E F
0 1.0 2020-09-26 1.0 3 test foo
1 1.0 2020-09-26 1.0 3 train foo
2 1.0 2020-09-26 1.0 3 test foo
3 1.0 2020-09-26 1.0 3 train foo
'''
print(df2.dtypes) # dtype of every column
'''
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
'''
print(df2.index) # the row labels
'''
Int64Index([0, 1, 2, 3], dtype='int64')
'''
print(df2.columns) # the column names
'''
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
'''
print(df2.values) # the values, one row per line
'''
[[1.0 Timestamp('2020-09-26 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2020-09-26 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2020-09-26 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2020-09-26 00:00:00') 1.0 3 'train' 'foo']]
'''
print(df2.describe()) # describe() shows summary statistics of the data
'''
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
'''
print(df2.T) # transpose
'''
0 ... 3
A 1 ... 1
B 2020-09-26 00:00:00 ... 2020-09-26 00:00:00
C 1 ... 1
D 3 ... 3
E test ... train
F foo ... foo
[6 rows x 4 columns]
'''
print(df2.sort_index(axis=1,ascending=False)) # sort by labels; axis=1 sorts the column labels, ascending=False means descending order
'''
F E D C B A
0 foo test 3 1.0 2020-09-26 1.0
1 foo train 3 1.0 2020-09-26 1.0
2 foo test 3 1.0 2020-09-26 1.0
3 foo train 3 1.0 2020-09-26 1.0
'''
print(df2.sort_values(by='E')) # sort the rows by the values in column E
'''
A B C D E F
0 1.0 2020-09-26 1.0 3 test foo
2 1.0 2020-09-26 1.0 3 test foo
1 1.0 2020-09-26 1.0 3 train foo
3 1.0 2020-09-26 1.0 3 train foo
'''


# Selecting data: by column name, by slice, by label (.loc), by position (.iloc) and by boolean mask.
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df['A'],df.A) # print column A (both spellings select the same column)
'''
2020-09-26 0
2020-09-27 4
2020-09-28 8
2020-09-29 12
2020-09-30 16
2020-10-01 20
Freq: D, Name: A, dtype: int32 2020-09-26 0
2020-09-27 4
2020-09-28 8
2020-09-29 12
2020-09-30 16
2020-10-01 20
Freq: D, Name: A, dtype: int32
'''
print(df[0:3],df['20200927':'20200930']) # slicing; when slicing by label the right endpoint is included
'''
A B C D
2020-09-26 0 1 2 3
2020-09-27 4 5 6 7
2020-09-28 8 9 10 11 A B C D
2020-09-27 4 5 6 7
2020-09-28 8 9 10 11
2020-09-29 12 13 14 15
2020-09-30 16 17 18 19
'''
print(df.loc['20200928']) # select by label
'''
A 8
B 9
C 10
D 11
Name: 2020-09-28 00:00:00, dtype: int32
'''
print(df.loc[:,['A','C']]) # keep only columns A and C
'''
A C
2020-09-26 0 2
2020-09-27 4 6
2020-09-28 8 10
2020-09-29 12 14
2020-09-30 16 18
2020-10-01 20 22
'''
print(df.iloc[3]) # row at integer position 3 (i.e. the fourth row)
'''
A 12
B 13
C 14
D 15
Name: 2020-09-29 00:00:00, dtype: int32
'''
print(df.iloc[3:5,1:3]) # positional slice of rows and columns
'''
B C
2020-09-29 13 14
2020-09-30 17 18
'''
print(df.iloc[[1,2,4],1:3]) # explicit row positions combined with a column slice
'''
B C
2020-09-27 5 6
2020-09-28 9 10
2020-09-30 17 18
'''
print(df[df.A > 8]) # rows whose value in column A is greater than 8
'''
A B C D
2020-09-29 12 13 14 15
2020-09-30 16 17 18 19
2020-10-01 20 21 22 23
'''


# Modifying values and adding columns.
df.iloc[0, 2] = 1111  # set a value by integer position (row 0, column 2)
df.loc['20200926', 'B'] = 2222  # set a value by row label and column name
# Set D to 0 in every row where A > 4. A single .loc call is used instead of
# the chained form `df.D[df.A > 4] = 0`, which assigns through an intermediate
# object and is not guaranteed to modify df itself.
df.loc[df.A > 4, 'D'] = 0
df['F'] = np.nan  # add a new column filled with NaN
print(df)
'''
A B C D F
2020-09-26 0 2222 1111 3 NaN
2020-09-27 4 5 6 7 NaN
2020-09-28 8 9 10 0 NaN
2020-09-29 12 13 14 0 NaN
2020-09-30 16 17 18 0 NaN
2020-10-01 20 21 22 0 NaN
'''
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)  # add a new column aligned on the date index
print(df)
'''
A B C D F E
2020-09-26 0 2222 1111 3 NaN 1
2020-09-27 4 5 6 7 NaN 2
2020-09-28 8 9 10 0 NaN 3
2020-09-29 12 13 14 0 NaN 4
2020-09-30 16 17 18 0 NaN 5
2020-10-01 20 21 22 0 NaN 6
'''


# Handling missing data.
# Drop every column (axis=1) that contains at least one NaN; with how='all' a
# column is dropped only when all of its values are NaN.
print(df.dropna(axis=1, how='any'))
'''
A B C D E
2020-09-26 0 2222 1111 3 1
2020-09-27 4 5 6 7 2
2020-09-28 8 9 10 0 3
2020-09-29 12 13 14 0 4
2020-09-30 16 17 18 0 5
2020-10-01 20 21 22 0 6
'''
# Column B holds integers; convert it to float before storing NaN.
# NOTE(review): recent pandas versions are understood to warn (and later
# raise) when NaN is written into an integer column - confirm against the
# pandas version in use.
df['B'] = df['B'].astype('float64')
df.iloc[1, 1] = np.nan
print(df.isnull())  # True wherever a value is missing
'''
A B C D F E
2020-09-26 False False False False True False
2020-09-27 False True False False True False
2020-09-28 False False False False True False
2020-09-29 False False False False True False
2020-09-30 False False False False True False
2020-10-01 False False False False True False
'''
# Is there any missing value at all? Reducing the underlying boolean array
# always yields a single bool, without the redundant `== True` comparison.
print(df.isnull().values.any())
'''
True
'''



# Read a CSV file into a DataFrame ('student.csv' is a relative path, so it is
# resolved against the current working directory).
data = pd.read_csv('student.csv')
print(data)

# Save the DataFrame to disk in pickle format.
data.to_pickle('student.pickle')


# Concatenating DataFrames with pd.concat.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# axis=0 stacks the frames vertically; ignore_index=True renumbers the rows.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
'''

# join, ('inner', 'outer')
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
res = pd.concat([df1, df2], axis=1, join='outer')  # keeps every row label (like a set union)
print(res)
'''
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
'''
res = pd.concat([df1, df2], axis=1, join='inner')  # keeps only row labels present in both frames
print(res)
'''
a b c d b c d e
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
'''

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the replacement and produces the same result here.
res = pd.concat([df1, df2, df3], ignore_index=True)
print(res)
'''
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0 NaN
7 2.0 2.0 2.0 2.0 NaN
8 2.0 2.0 2.0 2.0 NaN
'''

# Appending a single Series as a new row: turn it into a one-row DataFrame
# first (to_frame().T) so that its index labels line up with the columns.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
'''


# Merging two DataFrames on a shared key column.
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']})
res = pd.merge(left, right, on='key')
print(res)
'''
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
'''

# consider two keys
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})

# Inner join: only (key1, key2) pairs present in both frames are kept.
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)  # the original omitted this print although its output is shown below
'''
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
'''
# default for how='inner'
# how = ['left', 'right', 'outer', 'inner']
# Left join: every row of `left` is kept; unmatched rows get NaN for C and D.
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
'''
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
'''
# indicator: add a column recording which input frame(s) each merged row came from
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
'''
col1 col_left
0 0 a
1 1 b
'''
print(df2)
'''
col1 col_right
0 1 2
1 2 2
2 2 2
'''
# indicator=True adds a column named _merge (see the sample output below)
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
'''
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
'''
# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
'''
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
'''


# merged by index: join on the row labels instead of on a column
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])

# left_index and right_index
# how='outer' keeps every row label found in either frame
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
'''
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
'''
# how='inner' keeps only the row labels found in both frames
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
'''
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2
'''

# handle overlapping: both frames have an 'age' column, so suffixes are used
# to tell the two apart in the merged result
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
'''
k age_boy age_girl
0 K0 1 4
1 K0 1 5
'''


import matplotlib.pyplot as plt

# plot data

# Series: plot the cumulative sum of 1000 random values
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()

# DataFrame: four columns (A-D) of cumulative sums of random values
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
# passing ax=ax draws the second scatter on the axes returned by the first call
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)

plt.show()

image-20200926131000731

image-20200926131027005

转载声明

[1]:此部分内容转载自https://www.cnblogs.com/cupleo/p/11330373.html