Wnmp本地部署结合内网穿透实现任意浏览器远程访问本地服务-CFANZ编程社区

1. 项目简介

本项目旨在通过Python技术栈对京东平台上的手机数据进行抓取、分析并构建一个简单的手机推荐系统。主要功能包括：

- 网络爬虫：从京东获取手机数据；
- 数据分析：统计各厂商手机销售分布、市场占有率、价格区间和好评率；
- 可视化展示：使用ECharts进行数据可视化；
- 推荐系统：根据分析结果为用户推荐手机。

基于Python的京东手机数据可视化分析和推荐系统

2. 电商手机数据网络爬虫

使用Python的requests库和BeautifulSoup库实现对京东手机页面的爬取：商品列表页用BeautifulSoup解析HTML，评论数据则通过返回JSONP的评论接口获取（需先去掉回调函数包装再解析JSON）。抓取过程中需要处理分页、动态加载等问题。

def getCommentData(prod_id, format_url, proc, i, maxPage):
    """Walk the comment pages of one product for one score filter.

    Requests pages ``1..maxPage``, strips the JSONP callback wrapper from
    each response and decodes the JSON payload.  A page that fails (network
    error, malformed payload, missing key) is retried after a five second
    pause; after a fixed number of attempts the page is skipped so that a
    permanently broken page cannot stall the crawl forever.

    Args:
        prod_id: Not read by the visible code; kept so existing callers
            keep working.
        format_url: URL template with three positional ``str.format``
            fields, filled with ``proc``, ``i`` and the page number.
        proc: Value substituted into field ``{0}`` of ``format_url``.
            NOTE(review): the caller in this file passes the query fragment
            ``'productId=<id>'`` rather than a bare id -- confirm.
        i: Value substituted into field ``{1}`` (the ``score`` query
            parameter); selects the comment filter, e.g. all comments,
            with pictures, follow-up reviews, positive reviews.
        maxPage: Highest page number to request.

    Returns:
        None.
    """
    # Declared for the elided parsing code, which presumably appends the
    # parsed comments to this module-level list -- TODO confirm.
    global list_comment
    sig_comment = []
    max_attempts = 5  # per-page retry budget before the page is skipped

    cur_page = 0
    while cur_page < maxPage:
        cur_page += 1
        url = format_url.format(proc, i, cur_page)  # fill in the URL template
        for attempt in range(1, max_attempts + 1):
            try:
                # NOTE(review): verify=False disables TLS certificate
                # verification; kept from the original.
                # The timeout stops a stalled connection from hanging the crawl.
                response = requests.get(url=url, headers=comment_headers,
                                        verify=False, timeout=10)
                time.sleep(np.random.rand() * 2)  # random 0-2 s pause between requests

                # The endpoint answers with JSONP: callback({...});
                # Slice from the first '{' to the last '}' instead of assuming
                # the payload ends with exactly two trailing characters.
                raw = response.text
                startLoc = raw.find('{')
                endLoc = raw.rfind('}')
                if startLoc == -1 or endLoc < startLoc:
                    raise ValueError('no JSON object found in the response')
                jsonData = json.loads(raw[startLoc:endLoc + 1])

                comments = jsonData['comments']
                print("当前第%s页，最大%s页" % (cur_page, maxPage))
                for j, comment in enumerate(comments):
                    userId = comment['id']  # user ID
                    # Data parsing code (elided in the original article)
                    # ...........
                break
            except Exception as e:
                # ``Exception`` rather than a bare ``except`` so that Ctrl+C
                # (KeyboardInterrupt) and SystemExit still stop the crawler.
                print("the error is ", e)
                if attempt < max_attempts:
                    print('网络故障或者是网页出现了问题，五秒后重新连接')
                    time.sleep(5)
        else:
            # Every attempt failed: give up on this page and move on.
            print("giving up on page %s after %s attempts" % (cur_page, max_attempts))


def fetch_phone_comment(product, product_id):
    """Fetch the comment data of one product for every score filter.

    For each score value 0-5 and 7 (6 is skipped), the first page (page 0)
    is requested to learn ``maxPage``, then ``getCommentData`` walks the
    remaining pages.  A failing first-page request is retried after a five
    second pause; after a fixed number of attempts that score value is
    skipped instead of being retried forever.

    Args:
        product: Product name, used only in the progress message.
        product_id: Product id inserted into the ``productId`` query field.

    Returns:
        The ``list_comment`` list created in this function.
    """
    print('抓取 {} 产品的评论数据。。。。。。'.format(product))
    format_url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&{0}&score={1}&sortType=5&page={2}&pageSize=10&isShadowSku=0&fold=1'

    # NOTE(review): this is a local list, while getCommentData declares
    # ``global list_comment``.  Unless the elided parsing code fills this
    # local, the function returns an empty list -- confirm which list is
    # meant to collect the results.
    list_comment = []
    sig_comment = []
    proc = 'productId={}'.format(product_id)
    max_attempts = 5  # retry budget per score value

    for i in (0, 1, 2, 3, 4, 5, 7):  # score filters; 6 is skipped as in the original loop
        # Request page 0 first to learn the page count, then walk the pages.
        url = format_url.format(proc, i, 0)
        for attempt in range(1, max_attempts + 1):
            print(url)
            try:
                # NOTE(review): verify=False disables TLS certificate
                # verification; kept from the original.
                # The timeout stops a stalled connection from hanging the crawl.
                response = requests.get(url=url, headers=comment_headers,
                                        verify=False, timeout=10)

                # The original article elides the parsing here but reads
                # ``jsonData['maxPage']`` below.  Decode the JSONP payload
                # the same way getCommentData does: keep the text between
                # the first '{' and the last '}'.
                raw = response.text
                start = raw.find('{')
                end = raw.rfind('}')
                if start == -1 or end < start:
                    raise ValueError('no JSON object found in the response')
                jsonData = json.loads(raw[start:end + 1])
                # Data parsing code (elided in the original article)
                # ...........
                getCommentData(proc, format_url, proc, i, jsonData['maxPage'])  # walk every page
                break
            except Exception as e:
                print("the error is ", e)
                if attempt < max_attempts:
                    time.sleep(5)
        else:
            # Every attempt failed: skip this score value.
            print("giving up on score {} after {} attempts".format(i, max_attempts))

    return list_comment



def fetch_brand_phones(brand_name, brand_href):
    """ 抓取该品牌的手机数据 """
    page = 1
    size = 1

    while page < 10:
        brand_page_href = brand_href + '&page={}&s={}&click=0'.format(page, size)
        resp = requests.get(brand_page_href, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        items = soup.find_all('li', attrs={'class': 'gl-item'})

        all_phones = []
        for item in items:
            # 图片
            # 价格
            # 手机产品名称
            # 产品的详细链接
            # 抓取该产品的详细信息，此处为销量
            # 数据解析代码
            # ...........

            phone = {
                '品牌': brand_name,
                '图片': img,
                '价格': price,
                '产品名称': name,
                '链接': phone_href,
                '评论': json.dumps(comment_dict, ensure_ascii=False)
            }
            all_phones.append(phone)