Audio Data Augmentation: Experiments and Code

There are many ways to apply data augmentation to audio signals. This post experiments with the sox, soundfile, and librosa libraries; hopefully it helps anyone who needs it. The methods can be used as data preprocessing for deep-learning audio experiments.

Pitch Shift Augmentation

Pitch shift augmentation is a random roll along the frequency axis within roughly ±5% of the frequency bins. The roll wraps around, so all information is preserved.

import numpy as np
import soundfile as sf
import librosa


def pitch_shift_spectrogram(wavepath):
    """ Shift a spectrogram along the frequency axis in the spectral domain
    by a random amount
    """
    wave, sr = sf.read(wavepath)

    # convert the waveform to a spectrogram (STFT)
    spectrogram = librosa.stft(wave)
    nb_bins = spectrogram.shape[0]
    max_shifts = nb_bins // 20  # around 5% shift
    nb_shifts = np.random.randint(-max_shifts, max_shifts)

    # roll along the frequency axis (wrap-around, so no information is lost)
    shifted_spectrogram = np.roll(spectrogram, nb_shifts, axis=0)
    # convert the shifted spectrogram back to a waveform
    shifted_wave = librosa.istft(shifted_spectrogram)
    return shifted_wave, sr
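
The roll above wraps high-frequency bins around to the bottom of the spectrum, which preserves all information but can sound unnatural. An alternative worth knowing is librosa's built-in waveform pitch shifting; below is a minimal sketch, assuming mono input (pitch_shift_wave and the max_steps range are my own illustrative choices, not part of the original code).

def pitch_shift_wave(wavepath, max_steps=2):
    """ Pitch-shift the raw waveform by a random number of semitones
    in [-max_steps, max_steps] using librosa.effects.pitch_shift.
    """
    wave, sr = sf.read(wavepath)
    # random semitone offset; values near 0 leave the signal almost unchanged
    n_steps = np.random.uniform(-max_steps, max_steps)
    shifted_wave = librosa.effects.pitch_shift(wave, sr=sr, n_steps=n_steps)
    return shifted_wave, sr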

Time Shift Augmentation

Time shift augmentation shifts the signal by a random amount by rolling it along the time axis (the shift wraps around).

def time_shift_spectrogram(wavepath):
    """ Shift a spectrogram along the time axis in the spectral domain at random
    """
    wave, sr = sf.read(wavepath)

    # convert the waveform to a spectrogram (STFT)
    spectrogram = librosa.stft(wave)

    nb_cols = spectrogram.shape[1]
    nb_shifts = np.random.randint(0, nb_cols)

    # roll the spectrogram along the time axis by a random number of frames
    shifted_spectrogram = np.roll(spectrogram, nb_shifts, axis=1)

    # convert the shifted spectrogram back to a waveform
    shifted_wave = librosa.istft(shifted_spectrogram)

    # # save the shifted waveform as a WAV file
    # shifted_wavepath = wavepath[:-4] + "_shifted.wav"
    # sf.write(shifted_wavepath, shifted_wave, sr)

    return shifted_wave, sr
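
Since the roll wraps around in any case, you can get essentially the same augmentation without the STFT/ISTFT round trip (and its reconstruction error) by rolling the raw samples directly. A minimal sketch, assuming mono input; time_shift_wave is my own naming:

def time_shift_wave(wavepath):
    """ Roll the raw waveform along the time axis by a random offset
    (wrap-around shift, no spectrogram needed).
    """
    wave, sr = sf.read(wavepath)
    nb_shifts = np.random.randint(0, len(wave))
    shifted_wave = np.roll(wave, nb_shifts)
    return shifted_wave, sr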

Noise Augmentation

Noise augmentation simply adds a segment of random noise on top of the signal, with the dampening factor capped at 0.4.

def noise_augmentation(wavepath):
    """ Perform noise augmentation by adding random Gaussian noise on top of
    the wave with a dampening factor drawn uniformly from [0, 0.4]
    """
    wave, fs = sf.read(wavepath)
    # dampening_factor = 0.4
    # draw a random dampening factor
    dampening_factor = np.random.uniform(0.0, 0.4)
    # generate random noise with the same length as the audio
    noise = np.random.randn(len(wave))

    # mix the random noise into the audio
    wave = wave + noise * dampening_factor
    return wave, fs
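
If you want more direct control over how audible the added noise is, the scaling can be derived from a target signal-to-noise ratio instead of a fixed dampening range. A minimal sketch, assuming mono input; the function name and the 10-30 dB SNR range are illustrative choices, not part of the original code:

def noise_augmentation_snr(wavepath, min_snr_db=10.0, max_snr_db=30.0):
    """ Add Gaussian noise scaled to a random target SNR (in dB). """
    wave, fs = sf.read(wavepath)
    snr_db = np.random.uniform(min_snr_db, max_snr_db)
    # scale the noise so that signal_power / noise_power matches the target SNR
    signal_power = np.mean(wave ** 2)
    noise_power = signal_power / (10 ** (snr_db / 10.0))
    noise = np.random.randn(len(wave)) * np.sqrt(noise_power)
    return wave + noise, fs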

Same Class Augmentation

Same-class augmentation simply adds two audio clips s1 and s2 from the same class with a random mixing ratio.

def same_class_augmentation(wavepath, class_dir):
    """ Perform same class augmentation of the wave by loading a random segment
    from the class_dir and additively combining the wave with that segment.
    """
    sig_paths = glob.glob(os.path.join(class_dir, "*.wav"))

    # print('sig_paths:', sig_paths)

    aug_sig_path = np.random.choice(sig_paths, 1, replace=False)[0]
    print('aug_sig_path:', aug_sig_path)

    aug_sig, fs = sf.read(aug_sig_path)
    wave, fs1 = sf.read(wavepath)

    # resample aug_sig (linear interpolation) so it has the same length as wave
    aug_sig = interp1d(np.arange(len(aug_sig)), aug_sig)(np.linspace(0, len(aug_sig) - 1, len(wave)))

    # mix the two clips with a random ratio alpha
    alpha = np.random.rand()
    wave = (1.0 - alpha) * wave + alpha * aug_sig

    return wave, fs

Speed Augmentation

Note that the Python sox package must be installed, and on Windows the SoX software itself must be downloaded and installed; the folder containing sox.exe must be added to the system PATH.

# Speed augmentation using the sox library
def speed_aug(input_wav, output_wav, speed_factor=1.2):
    # create a SoX transformer object
    transformer = sox.Transformer()
    # speed factor: > 1 speeds the audio up, < 1 slows it down (pitch changes too)
    transformer.speed(speed_factor)

    # run the conversion and write the augmented file
    transformer.build(input_wav, output_wav)
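
Note that SoX's speed effect works by resampling, so the pitch shifts together with the duration. If you want to change duration while keeping pitch, pysox also provides a tempo effect; a minimal sketch along the same lines (tempo_aug is my own naming, not from the original code):

# Tempo augmentation: change duration without changing pitch
def tempo_aug(input_wav, output_wav, tempo_factor=1.2):
    transformer = sox.Transformer()
    # tempo_factor > 1 shortens the audio, < 1 stretches it; pitch is preserved
    transformer.tempo(tempo_factor)
    transformer.build(input_wav, output_wav)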

Complete Code

# -*- coding: utf-8 -*-
# @Time      :   2023/5/27 0027 15:18
# @Author    :   Jason
import glob
import os

import numpy as np
import sox  # pysox wrapper around the SoX executable
import soundfile as sf
from scipy.interpolate import interp1d
import librosa
from config import *  # assumed to provide startIdx used below

# make the SoX executable visible to this process (append instead of overwriting PATH)
os.environ['PATH'] += os.pathsep + r'G:\Program Files (x86)\sox-14-4-2'


# Same-class augmentation: mix two clips s1 and s2 from the same class with a random ratio
def same_class_augmentation(wavepath, class_dir):
    """ Perform same class augmentation of the wave by loading a random segment
    from the class_dir and additively combining the wave with that segment.
    """
    sig_paths = glob.glob(os.path.join(class_dir, "*.wav"))

    # print('sig_paths:', sig_paths)

    aug_sig_path = np.random.choice(sig_paths, 1, replace=False)[0]
    print('aug_sig_path:', aug_sig_path)

    aug_sig, fs = sf.read(aug_sig_path)
    wave, fs1 = sf.read(wavepath)

    # resample aug_sig (linear interpolation) so it has the same length as wave
    aug_sig = interp1d(np.arange(len(aug_sig)), aug_sig)(np.linspace(0, len(aug_sig) - 1, len(wave)))

    # mix the two clips with a random ratio alpha
    alpha = np.random.rand()
    wave = (1.0 - alpha) * wave + alpha * aug_sig

    return wave, fs


# Noise augmentation: add random noise on top of the signal, dampening factor at most 0.4
def noise_augmentation(wavepath):
    """ Perform noise augmentation by adding random Gaussian noise on top of
    the wave with a dampening factor drawn uniformly from [0, 0.4]
    """
    wave, fs = sf.read(wavepath)
    # dampening_factor = 0.4
    # draw a random dampening factor
    dampening_factor = np.random.uniform(0.0, 0.4)
    # generate random noise with the same length as the audio
    noise = np.random.randn(len(wave))

    # mix the random noise into the audio
    wave = wave + noise * dampening_factor
    return wave, fs


# Time shift augmentation: shift the signal by rolling it along the time axis (wrap-around shift)
def time_shift_spectrogram(wavepath):
    """ Shift a spectrogram along the time axis in the spectral domain at random
    """
    wave, sr = sf.read(wavepath)

    # convert the waveform to a spectrogram (STFT)
    spectrogram = librosa.stft(wave)

    nb_cols = spectrogram.shape[1]
    nb_shifts = np.random.randint(0, nb_cols)

    # roll the spectrogram along the time axis by a random number of frames
    shifted_spectrogram = np.roll(spectrogram, nb_shifts, axis=1)

    # convert the shifted spectrogram back to a waveform
    shifted_wave = librosa.istft(shifted_spectrogram)

    # # save the shifted waveform as a WAV file
    # shifted_wavepath = wavepath[:-4] + "_shifted.wav"
    # sf.write(shifted_wavepath, shifted_wave, sr)

    return shifted_wave, sr


# Pitch shift augmentation: random roll of about ±5% along the frequency axis (wrap-around, so no information is lost)
def pitch_shift_spectrogram(wavepath):
    """ Shift a spectrogram along the frequency axis in the spectral domain
    by a random amount
    """
    wave, sr = sf.read(wavepath)

    # convert the waveform to a spectrogram (STFT)
    spectrogram = librosa.stft(wave)
    nb_bins = spectrogram.shape[0]
    max_shifts = nb_bins // 20  # around 5% shift
    nb_shifts = np.random.randint(-max_shifts, max_shifts)

    # roll along the frequency axis (wrap-around)
    shifted_spectrogram = np.roll(spectrogram, nb_shifts, axis=0)
    # convert the shifted spectrogram back to a waveform
    shifted_wave = librosa.istft(shifted_spectrogram)
    return shifted_wave, sr


# Speed augmentation using the sox library
def speed_aug(input_wav, output_wav, speed_factor=1.2):
    # create a SoX transformer object
    transformer = sox.Transformer()
    # speed factor: > 1 speeds the audio up, < 1 slows it down (pitch changes too)
    transformer.speed(speed_factor)

    # run the conversion and write the augmented file
    transformer.build(input_wav, output_wav)


# Apply the augmentation methods as follows

if __name__ == "__main__":
    rootPath = 'H:\\Codes\\AudioClassification-Pytorch-master\\dataset\\audio'
    sourceType = ['C', 'M', 'E', '0', 'ru']  # class names before augmentation
    source_list = []
    newlist = ['C_a', 'M_a', 'E_a', '0_a', 'ru_a']  # class names after augmentation
    s = []
    t = []

    # note: range(0, len(sourceType) - 1) leaves out the last class ('ru')
    for i in range(0, len(sourceType) - 1):
        s = []
        t = []
        sourcePath = os.path.join(rootPath, sourceType[i])
        # print(sourcePath)
        # list all files and subfolders in the class folder
        source_list = os.listdir(sourcePath)
        # print(source_list)
        for j in source_list:
            s.append(os.path.join(sourcePath, j))

        print('s: ', s)

        for x in range(len(s)):
            targetPath = os.path.join(rootPath, newlist[i])
            # print(targetPath)
            # target_list = os.listdir(targetPath)
            # print(target_list)

            t.append(os.path.join(targetPath, str(startIdx)) + '~~~~.wav')
            startIdx += 1

        print('t: ', t)
        """
        从这里开始调用函数生成新样本
        """
        # 变速增强
        # for i in range(len(s)): # 1.2
        #     speed_aug(s[i], t[i])
        # for i in range(len(s)):
        #     speed_aug(s[i], t[i], 0.8)
        # for i in range(len(s)):
        #     speed_aug(s[i], t[i], 1.1)
        # for i in range(len(s)):
        #     speed_aug(s[i], t[i], 0.9)

        # 3724-4579
        # Same-class augmentation (mix two files)
        # for k in range(len(s)):
        #     print('-')
        #     print(s[k])
        #     # print(os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio', sourceType[i]))
        #     w, fs = same_class_augmentation(s[k], os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio',
        #                                                        sourceType[i]))
        #     print('t[k]: ', t[k])
        #     sf.write(t[k], w, fs)
        #     print('-')

        # 4580-5435
        # 5436-6291
        # Noise augmentation
        # for k in range(len(s)):
        #     print('-')
        #     print(s[k])
        #     # print(os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio', sourceType[i]))
        #     w, fs = noise_augmentation(s[k])
        #     print('t[k]: ', t[k])
        #     sf.write(t[k], w, fs)
        #     print('-')

        # Time shift augmentation
        # 6292-7147~~~
        # for k in range(len(s)):
        #     print('-')
        #     print(s[k])
        #     # print(os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio', sourceType[i]))
        #     w, fs = time_shift_spectrogram(s[k])
        #     print('t[k]: ', t[k])
        #     sf.write(t[k], w, fs)
        #     print('-')

        # Pitch shift augmentation
        # 7148-
        for k in range(len(s)):
            print('-')
            print(s[k])
            # print(os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio', sourceType[i]))
            w, fs = pitch_shift_spectrogram(s[k])
            print('t[k]: ', t[k])
            sf.write(t[k], w, fs)
            print('-')

        """
        结束
        """

        print('---------1 class end')

The results after augmentation are shown in the figure below.

[Figure: augmented audio files (random-noise example)]

References:

Official documentation of the libraries used (sox, soundfile, librosa)

https://zhuanlan.zhihu.com/p/41679490

https://github.com/johnmartinsson/bird-species-classification/wiki/Data-Augmentation

