Classifying Positive and Negative Product Reviews with Naive Bayes

Contents

Foreword

I. Fetching and Saving the Data

1. Fetching the positive reviews

2. Fetching the negative reviews

II. Processing the Data and Building the Model

1. Processing the data

1. Reading the data

2. Segmenting the reviews with jieba

3. Removing stopwords

2. Building the model

1. Converting the processed data into word vectors

2. Training the model and predicting

III. Predicting Whether a Given Comment Is Positive or Negative

Summary


Foreword

This post walks through classifying positive and negative product reviews with a machine-learning algorithm, then using the trained model to predict whether a given comment is positive or negative.

 

I. Fetching and Saving the Data

1. Fetching the positive reviews

import requests
import time
import re
from lxml import etree

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}

def pa():
    for j in range(1, 200):
        time.sleep(3)  # throttle requests to avoid being blocked
        url = f"https://review.suning.com/cluster_cmmdty_review/cluster-38249278-000000012389328846-0000000000-{j}-good.htm?originalCmmdtyType=generald488778a.10004.loverRight.166"
        response = requests.get(url, headers=head)
        sn_text = response.text
        tree = etree.HTML(sn_text)

        li_list = tree.xpath("//div[@class='rv-target-item']/div")
        for i in li_list:
            # xpath() returns a list of text nodes; join them into one string
            sn_coment = ''.join(i.xpath(".//p[@class='body-content']/text()"))
            # keep only the first line of the stripped comment text
            sn_coment = re.match(r".*", sn_coment.strip()).group()
            print(sn_coment)
            sngood.write(sn_coment + '\n')

if __name__ == '__main__':
    sngood = open('sngood.txt', 'w', encoding='utf8')
    pa()
    sngood.close()

Positive review data (screenshot omitted).

 

2. Fetching the negative reviews

import requests
import time
import re
from lxml import etree

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}

def pa():
    for j in range(1, 3):
        time.sleep(3)  # throttle requests to avoid being blocked
        url = f"https://review.suning.com/cluster_cmmdty_review/cluster-38249278-000000012389328846-0000000000-{j}-bad.htm?originalCmmdtyType=generald488778a.10004.loverRight.16610009"
        response = requests.get(url, headers=head)
        sn_text = response.text
        tree = etree.HTML(sn_text)

        li_list = tree.xpath("//div[@class='rv-target-item']/div")
        for i in li_list:
            # xpath() returns a list of text nodes; join them into one string
            sn_coment = ''.join(i.xpath(".//p[@class='body-content']/text()"))
            # keep only the first line of the stripped comment text
            sn_coment = re.match(r".*", sn_coment.strip()).group()
            print(sn_coment)
            snbad.write(sn_coment + '\n')

if __name__ == '__main__':
    snbad = open('snbad.txt', 'w', encoding='utf8')
    pa()
    snbad.close()

Negative review data (screenshot omitted).
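
The two crawlers above differ only in the page range, the -good/-bad segment of the URL, and the output file, so they can be merged into one parameterized function. A minimal refactor sketch under those assumptions (the crawl helper name is my own, and the query string, which differed slightly between the two originals, is kept from the good-review version):

import requests
import time
import re
from lxml import etree

HEAD = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}

def crawl(kind, pages, outfile):
    """kind is 'good' or 'bad'; pages is how many review pages to fetch."""
    with open(outfile, 'w', encoding='utf8') as f:
        for j in range(1, pages + 1):
            time.sleep(3)  # throttle requests
            url = (f"https://review.suning.com/cluster_cmmdty_review/"
                   f"cluster-38249278-000000012389328846-0000000000-{j}-{kind}.htm"
                   f"?originalCmmdtyType=generald488778a.10004.loverRight.166")
            tree = etree.HTML(requests.get(url, headers=HEAD).text)
            for item in tree.xpath("//div[@class='rv-target-item']/div"):
                comment = ''.join(item.xpath(".//p[@class='body-content']/text()")).strip()
                f.write(re.match(r".*", comment).group() + '\n')  # first line only

if __name__ == '__main__':
    crawl('good', 199, 'sngood.txt')
    crawl('bad', 2, 'snbad.txt')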

 

II. Processing the Data and Building the Model

1. Processing the data

1. Reading the data

import pandas as pd

# The scraped files have no header row, so name the single column explicitly;
# without header=None, pandas would treat the first review as the header.
cp_content = pd.read_csv('snbad.txt', encoding='utf8', sep='\t', header=None, names=['content'])
hp_content = pd.read_csv('sngood.txt', encoding='utf8', sep='\t', header=None, names=['content'])

 

2. Segmenting the reviews with jieba

"""
对差评分词
"""

import jieba

cp_segments = []
contents = cp_content['content'].values.tolist() # 将评论转换到列表内
for content in contents:
results = jieba.lcut(content) # 对每条评论进行分词
if len(results) > 1:
cp_segments.append(results)

cp_fc_results = pd.DataFrame({'content': cp_segments})
cp_fc_results.to_excel('snbad.xlsx', index=False)

"""
对好评分词
"""

hp_segments = []
contents = hp_content['content'].values.tolist() # 将评论转换到列表内
for content in contents:
results = jieba.lcut(content) # 对每条评论进行分词
if len(results) > 1:
hp_segments.append(results)

hp_fc_results = pd.DataFrame({'content': hp_segments})
hp_fc_results.to_excel('sngood.xlsx', index=False)
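
As a quick sanity check of what jieba.lcut returns (a plain list of token strings; the exact segmentation depends on jieba's dictionary and version):

import jieba

print(jieba.lcut('这个玩意真好,我很喜欢'))
# possible output (may vary by jieba version): ['这个', '玩意', '真好', ',', '我', '很', '喜欢']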

 

3. Removing stopwords

# Load the stopword list
stopwords = pd.read_csv('StopwordsCN.txt', encoding='utf8', engine='python', index_col=False)


# Remove stopwords from each tokenized comment
def drop_stopwords(contents, stopwords):
    segments_clean = []
    for content in contents:
        line_clean = []
        for word in content:
            if word in stopwords:
                continue
            line_clean.append(word)
        segments_clean.append(line_clean)  # one token list per comment, stopwords removed
    return segments_clean


# Apply the function to both comment sets
stopwords = stopwords['stopword'].tolist()  # .tolist() gives a list; .values would give an ndarray

contents = cp_fc_results['content'].tolist()
cp_fc_contents_clean_s = drop_stopwords(contents, stopwords)

contents = hp_fc_results['content'].tolist()
hp_fc_contents_clean_s = drop_stopwords(contents, stopwords)
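
One small performance note: drop_stopwords runs a membership test for every token, and `word in stopwords` on a list is O(n). Converting the list to a set makes each lookup O(1); a sketch of that variant (drop_stopwords_fast is my own name, not from the original):

def drop_stopwords_fast(contents, stopwords):
    stopword_set = set(stopwords)  # set membership tests are O(1)
    return [[word for word in content if word not in stopword_set]
            for content in contents]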

 

2. Building the model

1. Converting the processed data into word vectors

  • Note: build the vocabulary from the training-set data only; fitting the vectorizer on test data would leak information into the model.
"""
朴素贝叶斯分类
"""

'''给每个数据添加数字标签'''
cp_train = pd.DataFrame({'segments_clean': cp_fc_contents_clean_s, 'label': 1})
hp_train = pd.DataFrame({'segments_clean': hp_fc_contents_clean_s, 'label': 0})

pj_train = pd.concat([cp_train, hp_train]) # 上下相连
pj_train.to_excel('pj_train.xlsx', index=False)

'''数据切分'''
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = \
train_test_split(pj_train['segments_clean'].values, pj_train['label'].values, random_state=0)
# values 可加可不加 作用是将pj_train['segments_clean']获取到的series对象转换成numpy数组
# pj_train['segments_clean'].values 将该列所有值按顺序取出来,不带索引 取出来之后自动加上索引 x_train 也是随机取值 不带索引 取出来之后自动加上索引

'''将所有词转换成词向量'''
words = []
for index in range(len(x_train)):
words.append(' '.join(x_train[index]))
# print(words)

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(max_features=4000, lowercase=False, ngram_range=(1, 1))
# lowercase参数的功能:把所有的词是是否需要转换为小写。False。
vec.fit(words) # 传入训练集的所有文字, 根据文字构建词汇表
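
To verify what vocabulary was actually built, you can inspect the fitted vectorizer (get_feature_names_out is available in scikit-learn 1.0+; older releases use get_feature_names):

print(len(vec.vocabulary_))               # vocabulary size, at most max_features=4000
print(vec.get_feature_names_out()[:10])   # a few of the learned terms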

 

2. Training the model and predicting

  • Convert the raw text into a word-vector matrix with transform() before passing it to the model for training and prediction.
'''Import the multinomial naive Bayes classifier'''
from sklearn.naive_bayes import MultinomialNB

x_train = vec.transform(words)  # training text -> sparse count matrix
classifier = MultinomialNB(alpha=0.1)
classifier.fit(x_train, y_train)

train_predict = classifier.predict(x_train)

# Classification report on the training set
from sklearn import metrics

print(metrics.classification_report(y_train, train_predict))

# Evaluate on the test set
test_words = []
for line_index in range(len(x_test)):
    test_words.append(' '.join(x_test[line_index]))

x_test = vec.transform(test_words)  # reuse the vocabulary fitted on the training set

test_predict = classifier.predict(x_test)
print(metrics.classification_report(y_test, test_predict))
print(test_predict)

Output:

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1163
           1       0.21      1.00      0.35         7

    accuracy                           0.98      1170
   macro avg       0.61      0.99      0.67      1170
weighted avg       1.00      0.98      0.98      1170

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       386
           1       0.50      0.60      0.55         5

    accuracy                           0.99       391
   macro avg       0.75      0.80      0.77       391
weighted avg       0.99      0.99      0.99       391

Note the heavy class imbalance: only 7 negative reviews in the training set and 5 in the test set, which is why precision on class 1 is so low despite the high overall accuracy.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
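
Given that imbalance, a confusion matrix shows the per-class errors more directly than the averaged scores; a minimal sketch reusing the variables above:

from sklearn.metrics import confusion_matrix

# rows = true class (0 = positive, 1 = negative), columns = predicted class
print(confusion_matrix(y_test, test_predict))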

 

III. Predicting Whether a Given Comment Is Positive or Negative

  1. Segment the comment
  2. Remove the stopwords
  3. Convert it into a word-vector matrix
  4. Predict
"""单条评论判断"""

def text(comment):
# 1.分词
a = jieba.lcut(comment)

# 2.去除停用词
b = []
for i in a:
if i not in stopwords and len(i.strip()) > 0:
b.append(i)

c = []
c.append(' '.join(b[:])) # 将多个字符串连接并放到列表里

# 3.转换成词向量矩阵
c_train = vec.transform(c)

# 4.进行预测
c_pr = classifier.predict(c_train)
return c_pr

comment1 = '这个玩意真好,我很喜欢'
comment2 = '这个玩意太垃圾了'
print(text(comment1))
print(text(comment2))

Output:

  • Both comments are classified correctly (0 = positive, 1 = negative)
[0]
[1]

 

Summary

Once trained, a model like this can automate review classification, helping to improve customer service and drive personalized recommendations.
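
To reuse the model without re-scraping and retraining, one common approach (not part of the original post) is to persist both the fitted vectorizer and the classifier with joblib:

from joblib import dump, load

dump(vec, 'vec.joblib')                # save the fitted vectorizer
dump(classifier, 'classifier.joblib')  # save the trained classifier

# later, in another session:
vec = load('vec.joblib')
classifier = load('classifier.joblib')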
