数据分析——从入门到精通(二)-CFANZ编程社区

排序方法

np.sort(a,axis=-1,kind="quicksort") 创建a的副本进行数据元素的排列，不影响原有数据
ndarray.sort(axis=-1,kind="quicksort") 在数组对象上直接进行排序，会影响原有数据
np.argsort(a) 将元素排序之后，将元素所在的位置(index)返回，返回的是一个索引数组，可以作为行索引去使用
np.partition(a,kth,axis=-1) 部分排序：结果中下标为kth的元素处于完全排序后应在的位置，它前面的元素都不大于它，后面的元素都不小于它（两侧内部不保证有序）。如果kth是正值，排序结果保证了前kth个元素是最小的kth个；如果kth是负值（如-2），排序的结果保证了最后|kth|个元素是最大的|kth|个。

import numpy as np

 # 注意：randint里面的范围参数是左闭右开，比如设置1到100，它取不到100,要想取100，最值可以设为101
scores = np.random.randint(101,size=(10,5))  # 取0-101随机数，不包括101，10行5列
# 此时scores数组的值是变化的，每运行一次，就会出现不同的值
# 要想固定scores数组内的值，可以在生成随机数之前设置随机种子  np.random.seed(10)  # 10 只是种子值（任意整数均可），与生成元素的个数无关；种子相同，每次运行生成的随机数就相同
scores

[out]:

array([[20, 27, 33, 25, 80],
       [52, 71, 94, 63, 89],
       [84,  8, 20, 36, 18],
       [23, 23, 69, 94, 58],
       [29, 96, 43, 77,  4],
       [ 5, 57, 96, 28, 65],
       [97, 95, 12, 66, 62],
       [32, 54, 91, 10, 58],
       [46, 15, 18, 30, 59],
       [ 7, 47, 41, 94, 12]])

[int]:

#  np.sort(a, axis=-1, kind='quicksort', order=None)     axis=-1 表示维度的最内层，此例子中是给里面的这五个数进行排序
np.sort(scores)

[out]:

array([[20, 25, 27, 33, 80],
       [52, 63, 71, 89, 94],
       [ 8, 18, 20, 36, 84],
       [23, 23, 58, 69, 94],
       [ 4, 29, 43, 77, 96],
       [ 5, 28, 57, 65, 96],
       [12, 62, 66, 95, 97],
       [10, 32, 54, 58, 91],
       [15, 18, 30, 46, 59],
       [ 7, 12, 41, 47, 94]])

[int]:

np.sort(scores,axis=1)   # 二维数组当中的axis值为1和-1，结果都是一样的

[out]:

array([[20, 25, 27, 33, 80],
       [52, 63, 71, 89, 94],
       [ 8, 18, 20, 36, 84],
       [23, 23, 58, 69, 94],
       [ 4, 29, 43, 77, 96],
       [ 5, 28, 57, 65, 96],
       [12, 62, 66, 95, 97],
       [10, 32, 54, 58, 91],
       [15, 18, 30, 46, 59],
       [ 7, 12, 41, 47, 94]])

[int]:

#  axis=0 表示维度的最外层，此例子中是10个一维数组
np.sort(scores,axis=0)

[out]:

array([[ 5,  8, 12, 10,  4],
       [ 7, 15, 18, 25, 12],
       [20, 23, 20, 28, 18],
       [23, 27, 33, 30, 58],
       [29, 47, 41, 36, 58],
       [32, 54, 43, 63, 59],
       [46, 57, 69, 66, 62],
       [52, 71, 91, 77, 65],
       [84, 95, 94, 94, 80],
       [97, 96, 96, 94, 89]])

[int]:

# 创建scores的副本s2
s2 = scores.copy()
s2

[out]:

array([[20, 27, 33, 25, 80],
       [52, 71, 94, 63, 89],
       [84,  8, 20, 36, 18],
       [23, 23, 69, 94, 58],
       [29, 96, 43, 77,  4],
       [ 5, 57, 96, 28, 65],
       [97, 95, 12, 66, 62],
       [32, 54, 91, 10, 58],
       [46, 15, 18, 30, 59],
       [ 7, 47, 41, 94, 12]])

[int]:

scores

[out]:

array([[20, 27, 33, 25, 80],
       [52, 71, 94, 63, 89],
       [84,  8, 20, 36, 18],
       [23, 23, 69, 94, 58],
       [29, 96, 43, 77,  4],
       [ 5, 57, 96, 28, 65],
       [97, 95, 12, 66, 62],
       [32, 54, 91, 10, 58],
       [46, 15, 18, 30, 59],
       [ 7, 47, 41, 94, 12]])

[int]:

# 所有行的第一列  一维
scores[:,0]

[out]:

array([20, 52, 84, 23, 29,  5, 97, 32, 46,  7])

[int]:

# 排序
np.argsort(scores[:,0])  # 排的是所有的第一列， 返回索引的下标，可以理解为行索引
# 结果中的第一个 5 表示：最小的元素位于原数组索引为5的位置，其值是5
# 结果中的第二个 9 表示：第二小的元素位于原数组索引为9的位置，其值是7
#...

[out]:

array([5, 9, 0, 3, 4, 7, 8, 1, 2, 6], dtype=int32)

[int]:

scores[[5, 9, 0, 3, 4, 7, 8, 1, 2, 6]]  # 行索引不连续的选择方式  此例子中的以第一列进行排序的，其他列没有排

[out]:

array([[ 5, 57, 96, 28, 65],
       [ 7, 47, 41, 94, 12],
       [20, 27, 33, 25, 80],
       [23, 23, 69, 94, 58],
       [29, 96, 43, 77,  4],
       [32, 54, 91, 10, 58],
       [46, 15, 18, 30, 59],
       [52, 71, 94, 63, 89],
       [84,  8, 20, 36, 18],
       [97, 95, 12, 66, 62]])

[int]:

scores

[out]:

array([[20, 27, 33, 25, 80],
       [52, 71, 94, 63, 89],
       [84,  8, 20, 36, 18],
       [23, 23, 69, 94, 58],
       [29, 96, 43, 77,  4],
       [ 5, 57, 96, 28, 65],
       [97, 95, 12, 66, 62],
       [32, 54, 91, 10, 58],
       [46, 15, 18, 30, 59],
       [ 7, 47, 41, 94, 12]])

[int]:

scores[[0,5,2]]   # 利用行索引，可以取出对应的行  此例子中，分别取出了第一行，第六行，第三行

[out]:

array([[20, 27, 33, 25, 80],
       [ 5, 57, 96, 28, 65],
       [84,  8, 20, 36, 18]])

[int]:

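scores([0,2,5,4],[0,2])   # [注意] 此处误用了圆括号，所以报"not callable"错误；索引应使用方括号，行和列同时不连续选择应写作 scores[np.ix_([0,2,5,4],[0,2])]

[out]:

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-25-0ec13b0b855d> in <module>
----> 1 scores([0,2,5,4],[0,2])   # [注意] 此处误用了圆括号，所以报"not callable"错误；索引应使用方括号，行和列同时不连续选择应写作 scores[np.ix_([0,2,5,4],[0,2])]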


TypeError: 'numpy.ndarray' object is not callable

[int]:

#  所有的行的第一列到第二列
scores[:,:2]

[out]:

array([[20, 27],
       [52, 71],
       [84,  8],
       [23, 23],
       [29, 96],
       [ 5, 57],
       [97, 95],
       [32, 54],
       [46, 15],
       [ 7, 47]])

练习：如何根据第3列来对一个5*5的矩阵排序？

[int]:

a = np.random.randint(10,size=(5,5))     # 随机产生10以内的数，5行5列
a

[out]:

array([[8, 1, 7, 8, 4],
       [9, 6, 4, 2, 9],
       [9, 5, 9, 3, 7],
       [5, 3, 8, 8, 5],
       [2, 6, 0, 2, 4]])

[int]:

np.argsort(a[:,2])   # 所有的行，第三列排序，得到行索引

[out]:

array([4, 1, 0, 3, 2], dtype=int32)

[int]:

a[np.argsort(a[:,2])]   # 用第三列排序后得到的行索引重新排列各行，即按第三列从小到大对整个矩阵的行排序

[out]:

array([[2, 6, 0, 2, 4],
       [9, 6, 4, 2, 9],
       [8, 1, 7, 8, 4],
       [5, 3, 8, 8, 5],
       [9, 5, 9, 3, 7]])

[int]:

np.partition(a,2)  # 部分排序  np.partition(a, kth, axis=-1, kind='introselect', order=None)

[out]:

array([[1, 4, 7, 8, 8],
       [2, 4, 6, 9, 9],
       [3, 5, 7, 9, 9],
       [3, 5, 5, 8, 8],
       [0, 2, 2, 6, 4]])

[int]:

np.partition(a,2,axis=-1)  # axis=-1 表示对最内层（每一行）的数进行部分排序     kth=2 保证下标2处的元素就位，它前面的两个数是该行最小的两个（这两个数之间不保证有序）

[out]:

array([[1, 4, 7, 8, 8],
       [2, 4, 6, 9, 9],
       [3, 5, 7, 9, 9],
       [3, 5, 5, 8, 8],
       [0, 2, 2, 6, 4]])

[int]:

np.partition(a,-2,axis=-1)  # kth=-2  确定后两个数是最大的  看行

[out]:

array([[1, 4, 7, 8, 8],
       [2, 4, 6, 9, 9],
       [3, 7, 5, 9, 9],
       [3, 5, 5, 8, 8],
       [2, 0, 2, 4, 6]])

[int]:

np.partition(a,-2,axis=0)  #   kth=-2  确定后两个数是最大的  看列

[out]:

array([[5, 3, 4, 2, 4],
       [2, 1, 0, 2, 4],
       [8, 5, 7, 3, 5],
       [9, 6, 8, 8, 7],
       [9, 6, 9, 8, 9]])

[int]:

scores

[out]:

array([[20, 27, 33, 25, 80],
       [52, 71, 94, 63, 89],
       [84,  8, 20, 36, 18],
       [23, 23, 69, 94, 58],
       [29, 96, 43, 77,  4],
       [ 5, 57, 96, 28, 65],
       [97, 95, 12, 66, 62],
       [32, 54, 91, 10, 58],
       [46, 15, 18, 30, 59],
       [ 7, 47, 41, 94, 12]])

练习：统计每位同学的总成绩，显示排名前3名同学

np.sum(scores,axis=-1) # axis=-1 横着看   
# 185 是第一行的总和  20+27+33+25+80
# 369 是第二行的总和  52+71+94+63+89
# 166 是第三行的总和  84+8+20+36+18

[out]:

array([185, 369, 166, 267, 249, 251, 332, 245, 168, 201])

[int]:

np.sum(scores,axis=1)  # axis=1和axis=-1都是一样的，横着看

[out]:

array([185, 369, 166, 267, 249, 251, 332, 245, 168, 201])

[int]:

np.sum(scores,axis=0)  # 竖着看
# 395 是第一列的总和 20+52+84+23+29+5+97+32+46+7
# 493 是第二列的总和  27+71+8+23+96+57+95+54+15+47

[out]:

array([395, 493, 517, 523, 505])

[int]:

np.sum(scores,axis=-1).argsort()   # 横着看的总和值  返回对应的索引

[out]:

array([2, 8, 0, 9, 7, 4, 5, 3, 6, 1], dtype=int32)

[int]:

# 用总成绩排序后得到的行索引重新排列各行，即按总成绩从低到高排列所有同学
scores[np.sum(scores,axis=-1).argsort()]

[out]:

array([[84,  8, 20, 36, 18],
       [46, 15, 18, 30, 59],
       [20, 27, 33, 25, 80],
       [ 7, 47, 41, 94, 12],
       [32, 54, 91, 10, 58],
       [29, 96, 43, 77,  4],
       [ 5, 57, 96, 28, 65],
       [23, 23, 69, 94, 58],
       [97, 95, 12, 66, 62],
       [52, 71, 94, 63, 89]])

[int]:

# 对总和索引进行排序，取前三名同学的成绩，也就是最后三位同学
scores[np.sum(scores,axis=-1).argsort()][-3:]

[out]:

array([[23, 23, 69, 94, 58],
       [97, 95, 12, 66, 62],
       [52, 71, 94, 63, 89]])

[int]:

# 对总和索引进行排序，取前三名同学的成绩，也就是最后三位同学 ，进行求和
scores[np.sum(scores,axis=-1).argsort()][-3:].sum(axis=-1)

[out]:

array([267, 332, 369])

[int]:

# 对三名最高成绩反向
scores[np.sum(scores,axis=-1).argsort()][-3:][::-1]

[out]:

array([[52, 71, 94, 63, 89],
       [97, 95, 12, 66, 62],
       [23, 23, 69, 94, 58]])

[int]:

# 对总和索引进行排序，取前三名同学的成绩，也就是最后三位同学 ，进行求和  反向
scores[np.sum(scores,axis=-1).argsort()][-3:][::-1].sum(axis=-1)

[out]:

array([369, 332, 267])