Example waveform
import random

import torch
import torchaudio
import torchaudio.functional as F
import matplotlib.pyplot as plt
speech, sr = torchaudio.load("/home/wangguisen/projects/voice-keyword-spotting/data/test/关机/0a0a0630f24f11ea9bafac1f6ba08374.wav")
plot_waveform(speech, sample_rate=sr)
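The plot_waveform helper used above (and throughout this section) is not defined here; it is presumably defined earlier in the article. If not, a minimal sketch in the style of the torchaudio tutorial helpers could look like this (it relies on the torch and matplotlib imports above):

def plot_waveform(waveform, sample_rate, title="Waveform"):
    # plot each channel of a (channels, frames) tensor against time in seconds
    waveform = waveform.numpy()
    num_channels, num_frames = waveform.shape
    time_axis = torch.arange(0, num_frames) / sample_rate
    figure, axes = plt.subplots(num_channels, 1, squeeze=False)
    for c in range(num_channels):
        axes[c][0].plot(time_axis, waveform[c], linewidth=1)
        axes[c][0].grid(True)
    figure.suptitle(title)
    plt.show()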
Adding reverberation perturbation
Convolution reverb is a technique for making clean audio sound as if it had been produced in a different environment. For example, using a room impulse response (RIR), we can make clean speech sound as if it were spoken in a conference room. This process requires RIR data. The data below comes from the VOiCES dataset, but you can also record your own: just turn on a microphone and clap your hands.
rir_raw, rir_sr = torchaudio.load("/home/wangguisen/projects/voice-keyword-spotting/assets/RIR.wav")
plot_waveform(rir_raw, sample_rate=rir_sr)
First, we need to clean up the RIR: extract the main impulse and normalize it by its power.
rir = rir_raw[:, int(rir_sr * 1.01) : int(rir_sr * 1.3)]  # keep the segment containing the main impulse (1.01 s to 1.3 s)
rir = rir / torch.linalg.vector_norm(rir, ord=2)          # normalize by the signal power (L2 norm)
plot_waveform(rir, rir_sr, title="Room Impulse Response")
Then convolve the speech signal with the RIR:
augmented = F.fftconvolve(speech, rir)
plot_waveform(augmented, sr, title="RIR Applied")
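Note that the full convolution is longer than the input (len(speech) + len(rir) - 1 samples). A common choice, assumed here rather than taken from the original, is to trim the result back to the speech length before saving, mirroring the other sections:

rir_applied = augmented[:, : speech.shape[1]]  # keep only the first len(speech) samples
torchaudio.save("./rir_applied.wav", rir_applied, sample_rate=sr)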
Adding noise perturbation
noise_raw, noise_sr = torchaudio.load("/home/wangguisen/projects/voice-keyword-spotting/assets/noise.wav")
# randomly crop a 1-second (8000-sample) noise segment to match the speech length
start_index = random.randint(0, noise_raw.shape[1] - 8000)
noise_raw = noise_raw[:, start_index:start_index + 8000]
plot_waveform(noise_raw, sample_rate=noise_sr)
# SNR of 20 dB
snr_dbs = torch.tensor([20])
noisy_speeches = F.add_noise(speech, noise_raw, snr_dbs)
plot_waveform(noisy_speeches, sample_rate=sr)
torchaudio.save("./noisy_applied.wav", noisy_speeches, sample_rate=sr)
Adding volume perturbation
# Create a Vol transform; gain_type="amplitude" scales the waveform amplitude directly
# vol_transform = torchaudio.transforms.Vol(gain=2.0, gain_type="amplitude")  # double the amplitude
vol_transform = torchaudio.transforms.Vol(gain=0.5, gain_type="amplitude")  # halve the amplitude
vol_applied = vol_transform(speech)
plot_waveform(vol_applied, sample_rate=sr)
torchaudio.save("./vol_applied.wav", vol_applied, sample_rate=sr)
Adding time-shift perturbation
import torch.nn.functional as nnF  # aliased to avoid shadowing torchaudio.functional (imported above as F)
# First pad 10% of the signal length with zeros on each side
pad_length = int(speech.shape[1] * 0.1)
speech_pad = nnF.pad(speech, [pad_length, pad_length])
# 计算随机起始位置
offset = torch.randint(low=0,
high=speech_pad.shape[1] - sr + 1,
size=(1,)).item()
# Randomly crop a 1-second (sr-sample) sub-signal
speech_pad = speech_pad.narrow(dim=1, start=offset, length=sr)
plot_waveform(speech_pad, sample_rate=sr)
torchaudio.save("./speech_pad.wav", speech_pad, sample_rate=sr)