While looking for RSS sources, thanks go to:
https://zhidao.baidu.com/question/19599693.html
The two RSS feeds used in the original book can no longer be reached: http://newyork.craigslist.org/stp/index.rss and http://sfbay.craigslist.org/stp/index.rss
They are replaced here with http://www.nasa.gov/rss/dyn/image_of_the_day.rss and http://rss.cnn.com/rss/cnn_topstories.rss (the two feeds have 60 and 69 entries respectively).
To avoid the feeds becoming unreachable again, I saved them directly as files and read them from disk each time. The files are fairly large, so I put them in my CSDN resources; the packaged upload includes all the code, comments, RSS files and so on.
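For reference, here is a minimal sketch of my own (not part of the original code) showing how the two feeds could be fetched once and cached under the file names used in bayes03 below; the use of urllib.request and the makedirs call are my assumptions about how to do the download:

# Sketch: download each RSS feed once and cache it on disk, so later runs
# can parse the local copy instead of depending on the network.
import os
import urllib.request
import feedparser

feeds = {
    'RSS_file/image_of_the_day.rss': 'http://www.nasa.gov/rss/dyn/image_of_the_day.rss',
    'RSS_file/cnn_rss.xml': 'http://rss.cnn.com/rss/cnn_topstories.rss',
}
os.makedirs('RSS_file', exist_ok=True)
for path, url in feeds.items():
    with urllib.request.urlopen(url) as resp, open(path, 'wb') as f:
        f.write(resp.read())                        # save the raw XML

nasa = feedparser.parse('RSS_file/image_of_the_day.rss')    # parse the cached copy
print(len(nasa['entries']))                                 # number of entries (60 in my cached copy)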
The code is not a verbatim copy of the author's; it should still be a useful reference, and it works under Python 3.
With that, the main text starts here! (All of my code can be run directly, as long as the environment is set up correctly.)
1. bayes01_base_model.py
'''
Train a naive Bayes classifier.
Two document models are used: the set-of-words model and the bag-of-words model.
'''
from numpy import *
'''
The data comes from a Dalmatian lovers' message board; word-splitting the posts gives postingList.
classVec labels the data: 1 = abusive, 0 = not abusive.
Returns the data set and the labels.
'''
def loadDataSet():
    # vocabulary notes from the original: flea; dalmation = Dalmatian (a spotted dog breed); lick; steak
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
# Build a vocabulary list with no duplicate words
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)
'''
Set-of-words model: for each document we only record whether a word is present, not how many times it occurs.
Create a zero vector as long as the vocabulary and set to 1 the positions of the words that appear in the current sample.
Converts one document (one post) into a word vector.
'''
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: ", word, "is not in my Vocabulary!")
    return returnVec
'''
Bag-of-words model: for each document record how many times each word occurs.
Almost identical to the set-of-words code, except for the few places where the word counts are accumulated.
'''
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = returnVec[vocabList.index(word)] + 1
    return returnVec
'''
With the class labels known, count how frequently each word occurs in each class.
trainMatrix: the document matrix; trainCategory: the label vector.
'''
def trainNB0(trainMatrix, trainCategory):
    # number of documents
    numTrainDocs = len(trainMatrix)
    # total number of words in the vocabulary
    numWords = len(trainMatrix[0])
    # probability that a document is abusive
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    '''
    Optimization 1: to keep a single zero probability from driving the whole product to zero,
    initialize every word count to 1 and each denominator to 2 (Laplace smoothing).
    The improvement is significant.
    '''
    # p0Num = zeros(numWords); p1Num = zeros(numWords)
    # p0Denom = 0.0; p1Denom = 0.0  # Denom: the denominator term
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # Within a single document a word is either present or absent (set-of-words view);
            # across all documents we care about how likely each word is to appear in this class.
            p1Num = p1Num + trainMatrix[i]
            p1Denom = p1Denom + sum(trainMatrix[i])
        else:
            p0Num = p0Num + trainMatrix[i]
            p0Denom = p0Denom + sum(trainMatrix[i])
    '''
    Optimization 2: multiplying many very small probabilities can underflow, and floating-point
    rounding introduces errors. The fix is to take the natural log: the values change,
    but the location of the maximum and the outcome of the comparison do not.
    '''
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
# Given a document vector, compute the total (log-)probability under each class and compare the two.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # log p(c) plus the sum of log p(w|c) over the words present in the document
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
def testingNB():
    listOPosts, listClasses = loadDataSet()    # load the data set and the labels
    myVocabList = createVocabList(listOPosts)  # build the vocabulary list
    myVocabList.sort()                         # sort so the order is consistent and easier to inspect when printed
    # print(myVocabList) # ['I', 'ate', 'buying', 'cute', 'dalmation', 'dog', 'flea', 'food', 'garbage', 'has', 'help', 'him', 'how', 'is', 'licks', 'love', 'maybe', 'mr', 'my', 'not', 'park', 'please', 'posting', 'problems', 'quit', 'so', 'steak', 'stop', 'stupid', 'take', 'to', 'worthless']
    '''
    Test of the set-of-words model
    '''
    trainMat = []  # convert all documents into a single document matrix
    for postingDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postingDoc))
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)  # train the classifier
    # print(pAb)  # prior probability that any document is abusive = 0.5
    # print(p0V)  # word (log-)probabilities under the two classes
    # print(p1V)
    # print(column_stack((myVocabList, p0V, p1V)))  # a nicer view of each word's probability of being abusive
    # run a couple of test documents
    testEntry = ['love', 'my', 'dalmation', 'not', 'licks']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    '''
    Test of the bag-of-words model
    '''
    trainMat = []  # convert all documents into a single document matrix
    for postingDoc in listOPosts:
        trainMat.append(bagOfWords2VecMN(myVocabList, postingDoc))
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)  # train the classifier
    testEntry = ['love', 'my', 'dalmation', 'not', 'licks']
    thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
if __name__ == "__main__":
    testingNB()
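To make optimization 2 concrete, here is a small self-contained sketch of my own (not part of the book's code): classifyNB compares log p(c) plus the sum of log p(w|c) over the document's words, because multiplying a few hundred small probabilities underflows to 0.0 in double precision, while the equivalent sum of logarithms stays well within range and, since log is monotonic, preserves the comparison between the two classes.

# Sketch: why the class scores are computed in log space.
from math import log

probs = [0.01] * 200              # 200 word probabilities of 1% each
product = 1.0
for p in probs:
    product = product * p         # 0.01 ** 200 = 1e-400, below the double-precision range
print(product)                    # prints 0.0 -- the information is lost to underflow

log_score = sum(log(p) for p in probs)
print(log_score)                  # about -921.03 -- an ordinary float, safe to compare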
2. bayes02_spam_classification.py
'''
Filtering spam with naive Bayes: first get a list of token strings from the text, then build word vectors.
1. Collect the data
2. Prepare the data: parse the text content into token vectors
3. Analyze the data: inspect the tokens to make sure the parsing is correct
4. Train the algorithm: already implemented (trainNB0)
5. Test the algorithm: build a new test function that computes the error rate over the document set
6. Use the algorithm: build a complete program that classifies a set of documents and prints the misclassified ones
'''
import random
from bayes01_base_model import *
# Split with a regular expression: every character that is not a word character (letter, digit,
# underscore, or CJK) is treated as a separator; empty strings are dropped and tokens are lower-cased.
# To keep meaningless fragments from URLs out of the vocabulary, tokens shorter than 3 characters are discarded.
def textParse(bigString):
    import re
    listOfTokens = re.split(r'[^\w\u4e00-\u9fff]+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        filename = 'email/spam/' + str(i) + '.txt'
        wordList = textParse(open(filename).read())  # parse one spam email
        docList.append(wordList)                     # list of all parsed emails
        fullText.extend(wordList)                    # pooled list of all tokens
        classList.append(1)                          # label as spam
        wordList = textParse(open('email/ham/%d.txt' % i).read())  # parse one ham email
        docList.append(wordList)                     # list of all parsed emails
        fullText.extend(wordList)                    # pooled list of all tokens
        classList.append(0)                          # label as ham (not spam)
    vocabList = createVocabList(docList)             # deduplicated vocabulary
    trainingSet = list(range(50)); testSet = []
    # move 10 randomly chosen emails (out of the 50) into the test set
    for i in range(0, 10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    # training
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount = errorCount + 1
            print(classList[docIndex], "misclassified as", (1 - classList[docIndex]), ": ", docList[docIndex])
    errorRate = float(errorCount) / len(testSet)
    print("the error rate is", errorRate)
    return errorRate
# trainMat = []
# trainClasses = []
# for docIndex in trainingSet:
# trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
# trainClasses.append(classList[docIndex])
# p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
# errorCount = 0
# for docIndex in testSet:
# wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
# if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
# errorCount = errorCount + 1
# print(docList[docIndex])
# errorRate = float(errorCount) / len(testSet)
# print("the error rate is", errorRate)
# return errorRate
if __name__ == "__main__":
    print("error rate of a single hold-out cross-validation run:", spamTest())
    errAverage = 0
    for i in range(10):
        errAverage = errAverage + spamTest()
    errAverage = errAverage / 10
    print("average error rate over 10 hold-out cross-validation runs:", errAverage)
# There are several ways the classifier's performance could be improved further
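As a quick sanity check of textParse, here is a small sketch of my own (the sample sentence is just an example, and it assumes bayes02_spam_classification.py is importable from the current directory):

# Sketch: confirm that textParse splits on non-word characters,
# lower-cases the tokens and drops anything shorter than 3 characters.
from bayes02_spam_classification import textParse

sample = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
print(textParse(sample))
# expected output along these lines:
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']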
3. bayes03_region_related_phrase.py
'''
Using naive Bayes to discover region-related wording
1. Collect the data
2. Prepare the data
3. Analyze the data
4. Train the algorithm
5. Test the algorithm
6. Use the algorithm
Reference links (it took a lot of time to find English RSS feeds, but I finally found two with a decent amount of data, so it was worth it):
https://zhidao.baidu.com/question/19599693.html
'''
import feedparser
from bayes02_spam_classification import *
# Count how often each word occurs and return the most frequent ("most active") words
def calMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]
def stopWords():
    wordList = open('RSS_file/stopword.txt').read()
    listTokens = wordList.split()
    return [tok.lower() for tok in listTokens]
def localWords(feed1, feed0):
    import feedparser
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calMostFreq(vocabList, fullText)
    # remove the most frequent words, keep the less frequent ones
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    # then remove a custom list of everyday function words and other high-frequency words
    stopWordList = stopWords()
    for stopWord in stopWordList:
        if stopWord in vocabList:
            vocabList.remove(stopWord)
    averageErrorRate = 0
    # 10 rounds of hold-out cross-validation
    for k in range(10):
        trainingSet = list(range(2 * minLen))
        testSet = []
        # move some of the documents into the test set
        for i in range(50):
            randIndex = int(random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del(trainingSet[randIndex])
        trainMat = []; trainingClass = []
        for docIndex in trainingSet:
            trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
            trainingClass.append(classList[docIndex])
        p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainingClass))
        errorCount = 0
        for docIndex in testSet:
            wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
            if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
                errorCount = errorCount + 1
        errorRate = float(errorCount) / len(testSet)
        averageErrorRate = averageErrorRate + errorRate
        # print("the error rate is", errorRate)
    averageErrorRate = averageErrorRate / 10.0
    print("the average error rate is", averageErrorRate)
    return vocabList, p0V, p1V
# Display the most characteristic words of each feed
def getTopWords(nasa, cnn):
    import operator
    vocabList, p0V, p1V = localWords(nasa, cnn)
    topNasa = []; topCnn = []
    for i in range(len(p0V)):
        if p1V[i] > -6.0: topNasa.append((vocabList[i], p1V[i]))
        if p0V[i] > -6.0: topCnn.append((vocabList[i], p0V[i]))
    print("--------------------------nasa--------------------------")
    sortedNasa = sorted(topNasa, key=lambda pair: pair[1], reverse=True)
    for item in sortedNasa:
        print(item[0])
    print("--------------------------cnn--------------------------")
    sortedCnn = sorted(topCnn, key=lambda pair: pair[1], reverse=True)
    for item in sortedCnn:
        print(item[0])
if __name__ == "__main__":
    # The two feeds used in the book are no longer reachable:
    # http://newyork.craigslist.org/stp/index.rss and http://sfbay.craigslist.org/stp/index.rss
    # They are replaced here with:
    # http://www.nasa.gov/rss/dyn/image_of_the_day.rss and http://rss.cnn.com/rss/cnn_topstories.rss
    # To avoid them becoming unreachable again, the feeds are saved as local files and parsed from disk.
    nasa = feedparser.parse('RSS_file/image_of_the_day.rss')
    cnn = feedparser.parse('RSS_file/cnn_rss.xml')
    # print(len(nasa['entries']))
    # print(nasa['entries'])
    # The resulting accuracy is not high, around 0.5, probably because the two sources are quite similar
    # and the algorithm is fairly basic and has not been tuned yet.
    # localWords(nasa, cnn)
    getTopWords(nasa, cnn)
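A note of my own on the -6.0 cut-off in getTopWords: p0V and p1V hold natural-log probabilities, so keeping entries above -6.0 keeps exactly the words whose estimated P(word | class) exceeds exp(-6.0), roughly 0.25%; lowering the threshold prints more words, raising it prints fewer. A tiny check:

# Sketch: what the -6.0 log-probability threshold means on the probability scale.
from math import exp
print(exp(-6.0))   # about 0.00248, i.e. words with P(word | class) > ~0.25% are displayed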