为了对音乐风格进行分类,需要提取音频的一些特征,在本案例中提取了MFCC、频谱质心、色度和频谱对比度4类特征。Torchaudio中提供了相应的处理功能,不过为了保持本书的一贯性仍然使用了librosa进行处理。所有音频统一取前128帧,每帧的特征组合成一个33维的特征向量(13维MFCC、1维频谱质心、12维色度、7维频谱对比度),最后将提取的特征和标签数据保存为npy文件。特征提取阶段的代码如下:
#第12章/prepare_data.py
import librosa
import re
import numpy as np
# Root directory of the GTZAN dataset
root_dir = 'E:/Download/Genres/'
# File that receives the extracted feature array
feature_file = 'features.npy'
# File that receives the label array
label_file = 'labels.npy'

# All music genres in the dataset; the position in this list is the class index
genre_list = [
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
    'reggae',
    'rock',
]
# Build the list of audio files used for training
def make_list(start_num, end_num):
    """Return the .wav paths of the selected tracks for every genre.

    start_num: first track number (inclusive).
    end_num: last track number (exclusive, as it is passed to range()).

    Paths are grouped genre by genre in genre_list order and have the form
    root_dir + '<genre>/<genre>.<5-digit number>.wav'.
    """
    return [
        root_dir + genre + '/' + genre + '.' + f'{number:05d}' + '.wav'
        for genre in genre_list
        for number in range(start_num, end_num)
    ]
# Generate one-hot codes
def onehot(target):
    """Encode genre names as one-hot rows.

    target: array of genre names whose first axis indexes the samples.
    Returns a float array of shape (target.shape[0], len(genre_list)) with
    a single 1 per row at the genre's position in genre_list.
    Raises ValueError (from list.index) for a name not in genre_list.
    """
    num_classes = len(genre_list)
    code = np.zeros((target.shape[0], num_classes))
    for row, name in enumerate(target):
        code[row, genre_list.index(name)] = 1
    return code
# Extract audio features
def get_features(file_list):
    """Extract a fixed-size feature matrix and the genre name of each file.

    For every file, the first 128 frames of four librosa features are laid
    side by side per frame: 13 MFCCs (columns 0-12), the spectral centroid
    (column 13), 12 chroma bins (columns 14-25) and 7 spectral-contrast
    bands (columns 26-32), i.e. 33 values per frame.

    file_list: paths of the audio files; each file must sit in a directory
        named after its genre (as produced by make_list).

    Returns (data, target): data is a float64 array of shape
    (len(file_list), 128, 33); target is an array of genre names of shape
    (len(file_list), 1).
    """
    hop_length = 512
    num_frames = 128
    total = len(file_list)  # number of files
    data = np.zeros((total, num_frames, 33), dtype=np.float64)
    target = []
    for i, file in enumerate(file_list):
        # Load the file and compute MFCC, spectral centroid, chroma, contrast
        y, sr = librosa.load(file, sr=22050)
        mfcc = librosa.feature.mfcc(
            y=y, sr=sr, hop_length=hop_length, n_mfcc=13
        )
        centroid = librosa.feature.spectral_centroid(
            y=y, sr=sr, hop_length=hop_length
        )
        chroma = librosa.feature.chroma_stft(
            y=y, sr=sr, hop_length=hop_length
        )
        contrast = librosa.feature.spectral_contrast(
            y=y, sr=sr, hop_length=hop_length
        )
        # librosa returns (feature, frame); transpose to (frame, feature)
        data[i, :, 0:13] = mfcc.T[0:num_frames, :]
        data[i, :, 13:14] = centroid.T[0:num_frames, :]
        data[i, :, 14:26] = chroma.T[0:num_frames, :]
        data[i, :, 26:33] = contrast.T[0:num_frames, :]
        # The genre is the name of the file's parent directory. Taking the
        # second-to-last path component (instead of a fixed index counted
        # from the start) keeps this correct for any root directory depth
        # and for paths that contain spaces or backslashes.
        genre = re.split(r"[/\\]", file)[-2]
        target.append(genre)
        # Report progress
        print('Feature extraction for No. %i of %i files finished.'
              % (i + 1, total))
    return data, np.expand_dims(np.asarray(target), axis=1)
if __name__ == "__main__":
    # Build the audio file list (the first 70 tracks of every genre)
    filelist = make_list(0, 70)
    print('Total files: ', len(filelist))
    # Extract the features and derive the one-hot labels
    features, target = get_features(filelist)
    labels = onehot(target)
    # Save both arrays as .npy files
    for path, array in ((feature_file, features), (label_file, labels)):
        with open(path, "wb") as f:
            np.save(f, array)
    print('Feature extraction finished!')
上述代码运行后将生成features.npy和labels.npy两个数据文件供调用。由于每种音乐风格仅提取了前70个文件的特征,因此特征数据的维度为(700,128,33)。
12.3.3 模型及训练
接下来进入模型设计阶段。本案例采用LSTM模型,因而结构比较简单。训练的代码也并不复杂,每批样本数设为35个,这样全部样本共分为20批(700/35=20),考虑到处理速度,优先使用GPU进行训练。有关模型和训练过程的代码如下:
#第12章/train.py
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# LSTM model definition
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer that scores each class."""

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        """Create the LSTM stack and the fully connected output layer.

        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of output classes.
        num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # LSTM layer(s)
        self.lstm = nn.LSTM(
            self.input_size, self.hidden_size, self.num_layers
        )
        # Fully connected layer
        self.fc = nn.Linear(self.hidden_size, output_size)

    def forward(self, input, hidden=None):
        """Run the sequence through the LSTM and classify its last step.

        input: sequence-first tensor (nn.LSTM is built without batch_first).
        hidden: optional initial LSTM state; None lets nn.LSTM use zeros.
        Returns (log-probabilities per class, final LSTM state).
        """
        out, hidden = self.lstm(input, hidden)
        last_step = out[-1]  # output of the final time step only
        predict = F.log_softmax(self.fc(last_step), dim=1)
        return predict, hidden

    def cal_accuracy(self, predictions, labels, batch_size):
        """Return the fraction of correct predictions in one batch.

        predictions: per-class scores, one row per sample.
        labels: class index of every sample.
        batch_size: divisor used to turn the hit count into a rate.
        """
        # Index of the highest score in each row
        _, predicted = torch.max(predictions.data, 1)
        # Number of correct predictions
        hits = predicted.eq(labels.data).sum()
        return (hits / batch_size).item()
# Training routine
def train(features, labels, model, criterion, optimizer, epochs,
          num_batches, batch_size, byCuda):
    """Train model on consecutive mini-batches and print per-epoch metrics.

    features: tensor with samples on its first axis; every batch is
        permuted with (1, 0, 2) so the sample axis becomes the second one.
    labels: one-hot label tensor with samples on its first axis.
    model: module returning (predictions, hidden) and providing
        cal_accuracy(predictions, labels, batch_size).
    criterion: loss taking (predictions, class indices).
    optimizer: optimizer already bound to the model parameters.
    epochs: number of passes over the data.
    num_batches: batches per epoch; samples beyond
        num_batches * batch_size are not used.
    batch_size: samples per batch.
    byCuda: when true, move the model, the loss and every batch to the GPU.
    """
    # Move the model and the loss to the GPU when requested
    if byCuda:
        model = model.cuda()
        criterion = criterion.cuda()
    # Training loop
    for epoch in range(epochs):
        losses = 0.0
        accuracy = 0.0
        for i in range(num_batches):
            # Slice out one batch
            batch = slice(i * batch_size, (i + 1) * batch_size)
            # Reorder the axes for the model
            feature = features[batch].permute(1, 0, 2)
            # One-hot rows -> class indices, the format the loss expects
            label = torch.max(labels[batch], 1)[1]
            # Use the byCuda parameter here, not a module-level flag, so the
            # function also works when it is called from another module.
            if byCuda:
                feature = feature.cuda()
                label = label.cuda()
            # Reset the gradients
            model.zero_grad()
            # Forward pass; every batch starts from a fresh hidden state
            pred, _ = model(feature, None)
            # Loss, backward pass and parameter update
            loss = criterion(pred, label)
            loss.backward()
            optimizer.step()
            # Accumulate the loss and the accuracy
            losses += loss.item()
            accuracy += model.cal_accuracy(pred, label, batch_size)
        # Report the result of this epoch
        print('Epoch: %d | NLLoss: %.4f | Accuracy rate: %.2f'
              % (epoch+1, losses / num_batches,
                 100.0 * accuracy / num_batches))
if __name__ == "__main__":
    feature_file = 'features.npy'  # feature data file
    label_file = 'labels.npy'  # label data file
    # Stop early when the preprocessed npy files are missing
    if not (os.path.isfile(feature_file) and os.path.isfile(label_file)):
        print('Preprocessed files does not exist, please check!')
        sys.exit()
    print('Loading npy files')
    feature_data = np.load(feature_file)
    label_data = np.load(label_file)
    # Convert to tensors and show the array shapes
    features = torch.from_numpy(feature_data).type(torch.Tensor)
    labels = torch.from_numpy(label_data).type(torch.LongTensor)
    print(f'Features shape: {feature_data.shape}')
    print(f'Labels shape: {label_data.shape}')
    epochs = 200  # number of training epochs
    batch_size = 35  # samples per batch
    # Number of whole batches (700 / 35 = 20 for the prepared data set)
    num_batches = features.shape[0] // batch_size
    # Build the LSTM model and print its description
    print('Building LSTM model ...')
    model = LSTM(33, 128, 10, 2)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    print(model.parameters)
    # Check whether CUDA is available
    by_cuda = torch.cuda.is_available()
    print('Training on GPU ...' if by_cuda else 'Training on CPU ...')
    # Run the training
    train(features, labels, model, criterion,
          optimizer, epochs, num_batches, batch_size, by_cuda)
上述代码先检查是否有处理好的特征数据和标签数据文件,确认无误后开始构建模型,其间将输出如图12-19所示的信息。