|
发表于 2021-6-6 14:09:26
|
显示全部楼层
本楼为最佳答案
本帖最后由 小伤口 于 2021-6-6 14:10 编辑
这是我之前写的一段代码,用于爬取京东评论和用户照片,并做了简单的可视化处理,你可以参考一下:
- import requests
- import json
- import queue
- import collections
- import threading
- import time
- import numpy as np
- import matplotlib.pyplot as plt
- from matplotlib.font_manager import FontProperties
- dict_data = []  # integer month keys that main() parses from the lines of data2.txt
- dict_data2 = []  # distinct month keys as strings; used as the x values of the plot
- dict_data3 = []  # occurrence count for each entry of dict_data2; used as the y values
- zhao_pian = []  # photo URLs appended by MyThread.get_index and downloaded by main()
# Worker thread: scrape comment timestamps.
class MyThread_j(threading.Thread):
    """Thread that fetches one comment page and records each comment's month.

    A single URL is taken from the queue per run. For every entry under the
    response's ``comments`` key, the ``creationTime`` value is reduced to a
    ``YYYYMM`` string and appended as one line to ``data2.txt``.
    """

    # Browser-like User-Agent sent with every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    def __init__(self, q):
        """Store the queue *q* from which the URL to fetch is taken."""
        threading.Thread.__init__(self)
        self.q = q

    def run(self) -> None:
        """Thread entry point; delegates to :meth:`get_index`."""
        self.get_index()

    @staticmethod
    def _parse_jsonp(text):
        """Decode a JSON or JSONP (``callback(...);``) response body.

        ``str.lstrip``/``str.rstrip`` strip *character sets*, not a fixed
        prefix/suffix, so the wrapper is removed by position instead: the
        payload is whatever sits between the first ``(`` and the last ``)``.
        A body that is already bare JSON is decoded unchanged.

        Raises:
            json.JSONDecodeError: if the payload is not valid JSON.
        """
        body = text.strip()
        if not body.startswith(('{', '[')):
            start = body.find('(')
            end = body.rfind(')')
            if start != -1 and end > start:
                body = body[start + 1:end]
        return json.loads(body)

    @staticmethod
    def _month_key(creation_time):
        """Turn ``'YYYY-MM-DD HH:MM:SS'`` into the string ``'YYYYMM'``."""
        date_parts = creation_time.split(' ')[0].split('-')
        return date_parts[0] + date_parts[1]

    def get_index(self):
        """Fetch one URL from the queue and append its comment months to data2.txt."""
        url = self.q.get()
        # A timeout keeps a stalled connection from blocking the join() forever.
        res = requests.get(url, headers=self.HEADERS, timeout=10)
        jd = self._parse_jsonp(res.text)
        months = [self._month_key(comment['creationTime'])
                  for comment in jd['comments']]
        # Open the output file once per page rather than once per comment.
        with open("data2.txt", "a", encoding="utf-8") as file:
            file.writelines(month + '\n' for month in months)


# Aliases used by main(); every name refers to the same class.
MyThread_j5 = MyThread_j6 = MyThread_j7 = MyThread_j8 = MyThread_j9 = MyThread_j
# Worker thread: scrape comment text and photo URLs.
class MyThread(threading.Thread):
    """Thread that fetches one comment page and stores its text and photo URLs.

    A single URL is taken from the queue per run. Each comment's ``content``
    is appended as a line to ``data.txt``; each image URL found under the
    comment's ``images`` key is normalised to ``https://`` and appended to
    the module-level ``zhao_pian`` list.
    """

    # Browser-like User-Agent sent with every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    def __init__(self, q):
        """Store the queue *q* from which the URL to fetch is taken."""
        threading.Thread.__init__(self)
        self.q = q

    def run(self) -> None:
        """Thread entry point; delegates to :meth:`get_index`."""
        self.get_index()

    @staticmethod
    def _parse_jsonp(text):
        """Decode a JSON or JSONP (``callback(...);``) response body.

        ``str.lstrip``/``str.rstrip`` strip *character sets*, not a fixed
        prefix/suffix, so the wrapper is removed by position instead: the
        payload is whatever sits between the first ``(`` and the last ``)``.
        A body that is already bare JSON is decoded unchanged.

        Raises:
            json.JSONDecodeError: if the payload is not valid JSON.
        """
        body = text.strip()
        if not body.startswith(('{', '[')):
            start = body.find('(')
            end = body.rfind(')')
            if start != -1 and end > start:
                body = body[start + 1:end]
        return json.loads(body)

    @staticmethod
    def _photo_url(img_url):
        """Return *img_url* with whatever precedes its last ``//`` replaced by ``https://``."""
        return 'https://' + img_url.split('//')[-1]

    def get_index(self):
        """Fetch one URL from the queue; save comment text and collect photo URLs."""
        url = self.q.get()
        # A timeout keeps a stalled connection from blocking the join() forever.
        res = requests.get(url, headers=self.HEADERS, timeout=10)
        jd = self._parse_jsonp(res.text)
        with open("data.txt", "a", encoding="utf-8") as file:
            for comment in jd['comments']:
                file.write(comment['content'] + '\n')
                # Comments without photos may lack the key (or carry None);
                # handle exactly that case instead of a bare ``except: pass``.
                for image in comment.get('images') or []:
                    img_url = image.get('imgUrl')
                    if not img_url:
                        continue
                    photo = self._photo_url(img_url)
                    print(photo)
                    zhao_pian.append(photo)


# Aliases used by main(); every name refers to the same class.
MyThread_0 = MyThread_1 = MyThread_2 = MyThread_3 = MyThread_4 = MyThread
def _extract_product_id(page_url):
    """Return the last path segment of *page_url* up to its first dot."""
    return page_url.split('/')[-1].split('.')[0]


def _run_worker(worker_cls, url):
    """Run one *worker_cls* thread on *url* and wait for it to finish."""
    q = queue.Queue()
    q.put(url)
    worker = worker_cls(q)
    worker.start()
    # NOTE(review): workers are joined one at a time, so pages are fetched
    # sequentially and output order stays deterministic.
    worker.join()


def _download_photos(photo_urls):
    """Save each URL in *photo_urls* as ``<n>.jpg`` (n counts from 1)."""
    for index, photo_url in enumerate(list(photo_urls), start=1):
        try:
            r = requests.get(photo_url, timeout=10)
        except requests.RequestException as exc:
            # One unreachable image must not abort the whole run.
            print("skip photo %s: %s" % (photo_url, exc))
            continue
        with open(str(index) + '.jpg', 'wb') as f:
            f.write(r.content)


def _load_months(path="data2.txt"):
    """Return the sorted integer month keys stored one per line in *path*.

    Blank lines are ignored; a missing file yields an empty list.
    """
    try:
        with open(path, encoding="utf-8") as file:
            return sorted(int(line) for line in file if line.strip())
    except FileNotFoundError:
        return []


def _plot_trend(labels, counts):
    """Draw *counts* against *labels* as a red line with markers, save and show it."""
    x = np.array(labels)
    y = np.array(counts)
    fig = plt.figure(figsize=(25, 10))
    fig.patch.set_facecolor('Turquoise')
    ax = fig.add_subplot(111)
    ax.patch.set_facecolor('DeepSkyBlue')
    plt.plot(x, y, color='red', lw=2)
    plt.scatter(x, y, color='red', marker='o')
    # The font is loaded from a file path relative to the working directory.
    font = FontProperties(fname="mnjzbh.ttf")
    plt.title("销售趋势(按时间顺序)", fontsize=24, fontproperties=font)
    plt.xlabel("时间(年,月)", fontsize=14, fontproperties=font)
    plt.ylabel("销售数量(件)", fontsize=14, fontproperties=font)
    plt.savefig('Figure_1.png', bbox_inches='tight')
    plt.show()


def main(url_d):
    """Scrape comments for one product page, download photos and plot a trend.

    Args:
        url_d: Product page URL; the product id is taken from the last path
            segment, up to its first dot.

    Steps:
        1. Run ``MyThread_j`` on pages 0-24 of the ``score=1&sortType=6``
           listing (timestamps are appended to ``data2.txt`` by the worker).
        2. Run ``MyThread`` on the comment listings (text goes to
           ``data.txt``, photo URLs to ``zhao_pian``).
        3. Download every collected photo as ``1.jpg``, ``2.jpg``, ...
        4. Count the month keys in ``data2.txt``, publish the result in
           ``dict_data`` / ``dict_data2`` / ``dict_data3`` and plot it to
           ``Figure_1.png``.

    The workers open their output files in append mode, so content left in
    ``data2.txt`` by earlier runs is counted again.
    """
    product_id = _extract_product_id(url_d)
    base = "https://club.jd.com/comment/productPageComments.action"

    # Timestamp pages.
    for page in range(25):
        url = (base + "?callback=fetchJSON_comment98&productId=" + product_id
               + "&score=1&sortType=6&page=" + str(page)
               + "&pageSize=10&isShadowSku=0&fold=1")
        _run_worker(MyThread_j, url)

    # Comment pages: (score, pages) pairs in the order they are crawled.
    comment_batches = [
        (2, range(5)),
        (1, range(10)),
        (2, range(10, 15)),
        (3, range(15, 20)),
        (3, range(20, 25)),
    ]
    for score, pages in comment_batches:
        for page in pages:
            url = (base + "?callback=jQuery593159&productId=" + product_id
                   + "&score=" + str(score) + "&sortType=5&page=" + str(page)
                   + "&pageSize=10&pin=null&_=1616905936767")
            _run_worker(MyThread, url)

    # Fetch and save the photos.
    _download_photos(zhao_pian)

    # Extract the month keys, sort and count them.
    months = _load_months("data2.txt")
    dic = collections.Counter(months)
    # Replace (rather than extend) the shared lists so that calling main()
    # twice in one process does not double-count.
    dict_data[:] = months
    dict_data2[:] = [str(month) for month in dic]
    dict_data3[:] = [dic[month] for month in dic]

    # Draw the line chart.
    _plot_trend(dict_data2, dict_data3)
if __name__ == '__main__':
    # Prompt for the product page URL; surrounding whitespace from a pasted
    # link is dropped before it is parsed.
    url_d = input('输入网址:').strip()
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    main(url_d)
    end_time = time.perf_counter()
    # Print the elapsed time, truncated to whole seconds by %d.
    print("耗时%d" % (end_time - start_time))
复制代码 |
|