数据操作和数据预处理-CFANZ编程社区

import torch
torch.__version__

'1.8.1+cpu'

文章目录

一、数据操作
二、数据预处理

一、数据操作

标量 0d 例如：1.5
向量 1d 例如：[1,2]
矩阵 2d 例如：[[1,2],[3,4]]

# Create a row vector x: a 1-D tensor holding the integers 0 through 11
x = torch.arange(12)
x

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

# Inspect the tensor's shape (its length along each axis)
x.shape

torch.Size([12])

# Reshape into 3 rows x 4 columns; the result is only displayed, x is not reassigned
x.reshape(3,4)

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

# Total number of elements in the tensor
x.numel()

# All-zeros and all-ones tensors
# NOTE: (12) has no trailing comma, so it is the plain integer 12, not a tuple;
# the call produces a 1-D tensor of 12 zeros.
torch.zeros((12))

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

# Shape (1, 2): a single row of two zeros
torch.zeros((1,2))

tensor([[0., 0.]])

# Shape (1, 2, 3): one block of 2 rows x 3 columns, all zeros
torch.zeros((1,2,3))

tensor([[[0., 0., 0.],
         [0., 0., 0.]]])

# Same (1, 2, 3) shape, filled with ones instead of zeros
torch.ones((1,2,3))

tensor([[[1., 1., 1.],
         [1., 1., 1.]]])

# Random sampling
# Sometimes we want every element of a tensor to be drawn at random from some
# probability distribution. For example, when we build arrays that serve as the
# parameters of a neural network, we usually initialise them randomly. The code
# below creates a tensor of shape (3, 4) in which each element is sampled from a
# standard Gaussian (normal) distribution with mean 0 and standard deviation 1.

torch.randn((3,4))

tensor([[ 0.2384, -0.6686, -0.6075,  0.7796],
        [-0.7097, -0.8295, -0.4758,  0.3790],
        [-0.5237,  0.9664, -0.0939, -0.5642]])

# Convert a Python list into a tensor
torch.tensor([1,2,3])

tensor([1, 2, 3])

# Arithmetic: each operator is applied element by element to x and y
x, y = torch.tensor([1, 2, 3]), torch.tensor([2, 2, 2])
(x + y, x - y, x * y, x / y, x ** y)

(tensor([3, 4, 5]),
 tensor([-1,  0,  1]),
 tensor([2, 4, 6]),
 tensor([0.5000, 1.0000, 1.5000]),
 tensor([1, 4, 9]))

# Exponential, applied to every element of x
torch.exp(x)

tensor([ 2.7183,  7.3891, 20.0855])

# Tensor concatenation: build a 3x4 float tensor x and a 3x4 integer tensor y,
# then join them end to end
x = torch.arange(12, dtype=torch.float32).reshape(3, 4)
y = torch.tensor([[1, 2, 3, 4]] * 3)
torch.cat([x, y], dim=0)  # dim=0: stack along rows

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [ 1.,  2.,  3.,  4.],
        [ 1.,  2.,  3.,  4.],
        [ 1.,  2.,  3.,  4.]])

torch.cat((x,y),dim=1) # dim=1: concatenate along columns

tensor([[ 0.,  1.,  2.,  3.,  1.,  2.,  3.,  4.],
        [ 4.,  5.,  6.,  7.,  1.,  2.,  3.,  4.],
        [ 8.,  9., 10., 11.,  1.,  2.,  3.,  4.]])

# Element-wise equality test; the result is a boolean tensor with the same shape
x == y

tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])

# Sum over all elements, returned as a scalar tensor
x.sum()

tensor(66.)

# Broadcasting: a is a 3x1 column, b is a 1x2 row
a = torch.arange(3).reshape((3,1))
b = torch.arange(2).reshape((1,2))
a,b

(tensor([[0],
         [1],
         [2]]),
 tensor([[0, 1]]))

# The (3, 1) and (1, 2) shapes are broadcast to (3, 2) before the element-wise add
a+b

tensor([[0, 1],
        [1, 2],
        [2, 3]])

# Indexing and slicing: x[-1] is the last row, x[1:3] selects rows 1 and 2
x[-1],x[1:3]

(tensor([ 8.,  9., 10., 11.]),
 tensor([[ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]]))

# Write a single element in place: row 1, column 2
x[1,2] = 9
x

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  9.,  7.],
        [ 8.,  9., 10., 11.]])

# Assign 12 to every element of the first two rows
x[0:2,:] = 12 
x

tensor([[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [ 8.,  9., 10., 11.]])

# Saving memory

# y + x builds a new tensor and rebinds the name y to it, so the object
# identity recorded beforehand no longer matches (the comparison shows False).
before = id(y)
y = y + x
id(y) == before

False

# 如果在后续计算中没有重复使用 X，我们也可以使用 X[:] = X + Y 或 X += Y 来减少操作的内存开销。

# Convert the tensor to a NumPy array
a = x.numpy()
a

array([[12., 12., 12., 12.],
       [12., 12., 12., 12.],
       [ 8.,  9., 10., 11.]], dtype=float32)

# Convert the NumPy array back into a tensor
b = torch.tensor(a)
b

tensor([[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [ 8.,  9., 10., 11.]])

# A one-element tensor can be turned into a Python scalar with item(), float() or int()
a = torch.tensor([3.5])
a,a.item(),float(a),int(a)

(tensor([3.5000]), 3.5, 3.5, 3)

二、数据预处理

import os

# Write a tiny house-price CSV into ../data, creating the directory if needed.
# 'NA' entries are the missing values (they appear as NaN once the file is loaded).
data_file = os.path.join('..', 'data', 'house_tiny.csv')
os.makedirs(os.path.dirname(data_file), exist_ok=True)
csv_rows = [
    'NumRooms,Alley,Price\n',  # column names
    'NA,Pave,127500\n',  # each following row is one data sample
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
]
with open(data_file, 'w') as f:
    f.writelines(csv_rows)

# If pandas is not installed, just uncomment the following line:
# !pip install pandas
import pandas as pd

# Load the CSV file; its 'NA' fields show up as NaN in the printed frame
data = pd.read_csv(data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000

# Split the frame: the first two columns are the features, the third is the target.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Fill missing numeric values with the column mean. numeric_only=True is needed
# because 'Alley' holds strings: pandas >= 2.0 raises TypeError when asked for the
# mean of a non-numeric column instead of silently skipping it.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)

   NumRooms Alley
0       3.0  Pave
1       2.0   NaN
2       4.0   NaN
3       3.0   NaN

# One-hot encode the categorical 'Alley' column; dummy_na=True adds a separate
# indicator column for missing values. dtype=int keeps the indicators as 0/1:
# pandas >= 2.0 otherwise emits bool columns, which print as True/False and turn
# inputs.values into an object array that torch.tensor cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       3.0           1          0
1       2.0           0          1
2       4.0           0          1
3       3.0           0          1

import torch

# Convert features and targets to tensors. to_numpy(dtype=float) forces a numeric
# float64 array even when the one-hot columns are bool (the get_dummies default in
# pandas >= 2.0); plain .values would then be an object array, which torch.tensor
# rejects.
X, y = torch.tensor(inputs.to_numpy(dtype=float)), torch.tensor(outputs.values)
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))