This project walks through implementing a deep learning project from 0 to 1 in code, covering basic models such as TextCNN, CNN, LSTM, and BERT.
It is aimed at beginners who are just getting started with PyTorch.
Note:
How to convert NumPy data into torch tensors:
self._x = torch.from_numpy(np.float32(train_data))
self._y = torch.from_numpy(np.float32(labels))
np.random.rand() produces float64 data, while torch defaults to float32, so cast to float32 before converting:
feature = torch.from_numpy(np.float32(...))
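A minimal sketch of the dtype mismatch and its fix (the array here is made up purely for illustration):

import numpy as np
import torch

arr = np.random.rand(4, 3)                 # np.random.rand() returns float64
x_f64 = torch.from_numpy(arr)              # dtype: torch.float64
x_f32 = torch.from_numpy(np.float32(arr))  # dtype: torch.float32, matches torch's default
# equivalent fixes: torch.from_numpy(arr).float() or torch.tensor(arr, dtype=torch.float32)
print(x_f64.dtype, x_f32.dtype)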
Template file package: gene sequence classification based on TextCNN.
Commonly used imports; these libraries come up all the time in PyTorch development.
import os
import re
import math
import random
import copy  # used to snapshot the best model during training
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms

from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from gensim.models import KeyedVectors

warnings.filterwarnings('ignore')
# gensim 4.0.0 (note: in gensim 4.x, `iter` was renamed to `epochs` and `size` to `vector_size`)
# sentences = word2vec.LineSentence("train_word.txt")
# model = word2vec.Word2Vec(sentences, epochs=iter1, sg=sg, hs=hs, min_count=1, window=window, vector_size=size)
# model.wv.save_word2vec_format("myword2vec.model", binary=False)
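A minimal runnable sketch of training and saving these word vectors with gensim 4.x (the file names follow the snippet above; the hyperparameter values are placeholders, chosen so that vector_size matches the Dim=100 used by the model further down):

from gensim.models import word2vec

sentences = word2vec.LineSentence("train_word.txt")   # one whitespace-tokenized sequence per line
w2v = word2vec.Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, hs=0, epochs=10)
w2v.wv.save_word2vec_format("myword2vec.model", binary=False)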
Template code for reading data with the Dataset class:
new_model = KeyedVectors.load_word2vec_format('myword2vec.model', binary=False)

class mydataset(Dataset):
    def __init__(self):  # read and load the data
        # Labels: each FASTA record spans two lines; the header line carries
        # "n" (negative -> 0) or "p" (positive -> 1) right after ">".
        labels = []
        with open("Rice_880.fasta", "r", encoding="utf-8") as f:
            lines = f.readlines()
        for i in range(0, len(lines), 2):
            temp = lines[i:i + 2]
            temp_label = temp[0][1]
            if temp_label == "n":
                temp_label = 0
            if temp_label == "p":
                temp_label = 1
            labels.append(temp_label)

        # Features: each line of myword.txt is one tokenized sequence; look up
        # the word2vec vector of every token.
        train_data = []
        with open("myword.txt", "r", encoding="utf-8") as f:
            lines = f.readlines()
        for line in lines:
            one_line = line.split("\n")[0].split(" ")
            temp_arr = []
            for one in one_line:
                vec = new_model[one]
                temp_arr.append(vec)
            train_data.append(np.array(temp_arr))

        # Convert to torch tensors. Note: np.random.rand() produces float64,
        # while torch defaults to float32, so cast first:
        # feature = torch.from_numpy(np.float32(...))
        self._x = torch.from_numpy(np.float32(train_data))
        self._y = torch.from_numpy(np.float32(labels))
        self._len = len(labels)

    def __getitem__(self, item):
        return self._x[item], self._y[item]

    def __len__(self):  # return the size of the whole dataset
        return self._len
data = mydataset()

# Split into training and test sets (80% / 20%, shuffled at random; the lengths are rounded)
train_data, test_data = random_split(data, [round(0.8 * len(data)), round(0.2 * len(data))])
# Some versions also accept generator=torch.Generator().manual_seed(0) for a reproducible split
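On PyTorch versions that support it (1.6 and later), passing a seeded generator makes the split reproducible across runs; a minimal sketch:

g = torch.Generator().manual_seed(0)
train_data, test_data = random_split(data, [round(0.8 * len(data)), round(0.2 * len(data))], generator=g)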
PyTorch model-building template
class textCNN(nn.Module):
    def __init__(self):
        super(textCNN, self).__init__()
        Dim = 100       # length of each word vector
        Cla = 2         # number of classes
        Ci = 1          # number of input channels, usually 1 (unrelated to batch size)
        Knum = 50       # number of kernels per kernel size
        Ks = [2, 3, 4]  # list of kernel sizes, e.g. [2, 3, 4]
        dropout = 0.5

        self.convs = nn.ModuleList([nn.Conv2d(Ci, Knum, (K, Dim)) for K in Ks])  # convolution layers
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(Ks) * Knum, Cla)  # fully connected layer

    def forward(self, x):
        x = x.unsqueeze(1)  # (N, Ci, W, D): input arrives as [batch, height, width] -> [batch, 1, height, width]
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # len(Ks) * (N, Knum, W)
        x = [F.max_pool1d(line, line.size(2)).squeeze(2) for line in x]  # len(Ks) * (N, Knum)
        x = torch.cat(x, 1)  # (N, Knum * len(Ks))
        x = self.dropout(x)
        logit = self.fc(x)
        return logit
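A quick sanity check of the forward pass with a dummy batch (the batch size 8 and sequence length 41 are arbitrary illustrative values; only the last dimension has to equal Dim=100):

m = textCNN()
dummy = torch.randn(8, 41, 100)   # [batch, sequence length, embedding dim]
print(m(dummy).shape)             # torch.Size([8, 2]) -- logits over the 2 classes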
# Evaluate the model on the test data; returns the average loss over the test batches
def eval_test(model):
    test_epoch_loss = []
    with torch.no_grad():
        for step, (test_x, test_y) in enumerate(test_loader):
            test_x = test_x.to(device)
            test_y = test_y.to(device)
            y_pre = model(test_x)
            test_loss = loss_function(y_pre, test_y.long())
            test_epoch_loss.append(test_loss.item())
    return np.mean(test_epoch_loss)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 50
batch_size = 64
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
model = textCNN().to(device)
loss_function = torch.nn.CrossEntropyLoss().to(device)  # cross-entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
print(model)
Training and prediction
sum_train_epoch_loss = []  # training loss per epoch
sum_test_epoch_loss = []   # test loss per epoch
best_test_loss = 100000000
for epoch in range(epochs):
    epoch_loss = []
    for step, (train_x, train_y) in enumerate(train_loader):
        train_x = train_x.to(device)
        train_y = train_y.to(device)
        y_pred = model(train_x)
        single_loss = loss_function(y_pred, train_y.long())
        optimizer.zero_grad()    # clear gradients left over from the previous step
        single_loss.backward()   # backpropagate to compute gradients
        optimizer.step()         # let the optimizer update the network parameters
        epoch_loss.append(single_loss.item())
    train_epoch_loss = np.mean(epoch_loss)
    test_epoch_loss = eval_test(model)  # average loss on the test data
    if test_epoch_loss < best_test_loss:
        best_test_loss = test_epoch_loss
        print("best_test_loss", best_test_loss)
        best_model = copy.deepcopy(model)  # keep a snapshot; plain `best_model = model` would only alias the live model
    sum_train_epoch_loss.append(train_epoch_loss)
    sum_test_epoch_loss.append(test_epoch_loss)
    print("epoch:" + str(epoch) + " train_epoch_loss: " + str(train_epoch_loss) + " test_epoch_loss: " + str(test_epoch_loss))

torch.save(best_model, 'best_model.pth')
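Saving the whole model object works, but a commonly recommended alternative is to save only the weights; a sketch ('best_model_state.pth' is a hypothetical file name, and loading assumes the textCNN class is available):

# torch.save(best_model.state_dict(), 'best_model_state.pth')               # save weights only (hypothetical file name)
# model.load_state_dict(torch.load('best_model_state.pth', map_location=device))  # and load them back later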
print(sum_train_epoch_loss)
print(sum_test_epoch_loss)
fig = plt.figure(facecolor='white', figsize=(10, 7))
plt.xlabel('epoch')
plt.ylabel('loss')
plt.xlim(xmax=len(sum_train_epoch_loss), xmin=0)
plt.ylim(ymax=max(sum_train_epoch_loss), ymin=0)
# x/y coordinates of the two scatter series: epoch index vs. loss value
x1 = [i for i in range(0, len(sum_train_epoch_loss), 1)]
y1 = sum_train_epoch_loss
x2 = [i for i in range(0, len(sum_test_epoch_loss), 1)]
y2 = sum_test_epoch_loss
colors1 = '#00CED4'  # color of the training-loss points
colors2 = '#DC143C'  # color of the test-loss points
area = np.pi * 4 ** 1  # point size
# scatter plots
plt.scatter(x1, y1, s=area, c=colors1, alpha=0.4, label='train_loss')
plt.scatter(x2, y2, s=area, c=colors2, alpha=0.4, label='val_loss')
plt.legend()
# plt.savefig(r'C:\Users\jichao\Desktop\大论文\12345svm.png', dpi=300)
plt.show()
from sklearn.metrics import confusion_matrix, matthews_corrcoef, recall_score

# Load the best model:
model.load_state_dict(torch.load('best_model.pth').cpu().state_dict())
model.eval()

test_pred = []
test_true = []
# Evaluate directly on the held-out test set
with torch.no_grad():
    for step, (test_x, test_y) in enumerate(test_loader):
        test_x = test_x.to(device)
        test_y = test_y.to(device)
        y_pre = model(test_x)
        y_pre = torch.argmax(y_pre, dim=1)
        for i in y_pre:
            test_pred.append(i.item())
        for i in test_y:
            test_true.append(i.item())

print(test_pred[:10])
print(test_true[:10])
Acc = accuracy_score(test_true, test_pred)   # sklearn metrics expect (y_true, y_pred)
Mcc = matthews_corrcoef(test_true, test_pred)
Sn = recall_score(test_true, test_pred)      # sensitivity = recall on the positive class
tn, fp, fn, tp = confusion_matrix(test_true, test_pred).ravel()
Sp = tn / (tn + fp)                          # specificity
print("Acc", Acc)
print("Mcc", Mcc)
print("Sn", Sn)
print("Sp", Sp)
An example result plot for the gene sequence classification task.