[已解决]xdm，爬虫求助

小伤口 · 发表于 2021-6-6 14:09:26

本帖最后由小伤口于 2021-6-6 14:10 编辑

这是我之前写的爬取京东评论和用户照片、并加上简单可视化处理的代码，你可以参考一下：
import requests

import json

import queue

import collections

import threading

import time

import numpy as np

import matplotlib.pyplot as plt

from matplotlib.font_manager import FontProperties

# Module-level accumulators used while crawling and plotting:
#   dict_data  - integer values read back from data2.txt
#   dict_data2 - the distinct values of dict_data as strings (x axis)
#   dict_data3 - how often each distinct value occurs (y axis)
#   zhao_pian  - photo URLs collected by the comment crawler
dict_data, dict_data2, dict_data3, zhao_pian = [], [], [], []

# 创建多线程

#爬取时间

class MyThread_j(threading.Thread):
    """Worker thread that records the year and month of every comment on a page.

    Each instance takes a single URL from the queue, downloads the JSONP
    comment page and appends one ``YYYYMM`` line per comment to ``data2.txt``.
    """

    # Request headers: a desktop Chrome User-Agent string.
    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    # Seconds to wait before a stalled request is abandoned.
    _REQUEST_TIMEOUT = 10

    def __init__(self, q):
        """Remember ``q``, the queue from which ``get_index`` takes its URL."""
        threading.Thread.__init__(self)
        self.q = q

    def run(self) -> None:
        """Thread entry point: process exactly one queued URL."""
        self.get_index()

    @staticmethod
    def _parse_jsonp(text):
        """Decode a JSON document that may be wrapped as ``callback(...);``.

        ``str.lstrip``/``str.rstrip`` remove *sets of characters*, not a
        prefix or suffix, so the wrapper is cut off by position instead.

        Raises:
            ValueError: if what remains is not valid JSON (for example an
                empty response body).
        """
        body = text.strip()
        if body[:1] not in ('{', '['):
            opening = body.find('(')
            closing = body.rfind(')')
            if opening != -1 and closing > opening:
                body = body[opening + 1:closing]
        return json.loads(body)

    @staticmethod
    def _year_month(creation_time):
        """Turn ``'2021-03-05 10:00:00'`` into ``'202103'``."""
        date_parts = creation_time.split(' ')[0].split('-')
        return date_parts[0] + date_parts[1]

    def get_index(self):
        """Download one comment page and append its comment months to data2.txt."""
        url = self.q.get()
        try:
            res = requests.get(url, headers=self._HEADERS,
                               timeout=self._REQUEST_TIMEOUT)
            jd = self._parse_jsonp(res.text)
        except (requests.RequestException, ValueError) as err:
            # A failed or unparsable page is reported and skipped.
            print("请求失败，已跳过: %s (%s)" % (url, err))
            return

        comments = jd.get('comments') if isinstance(jd, dict) else None
        months = [self._year_month(item['creationTime'])
                  for item in comments or []]
        if not months:
            return

        # Open the output once per page instead of once per comment.
        with open("data2.txt", "a", encoding="utf-8") as file:
            file.writelines(month + '\n' for month in months)

# Publish MyThread_j5 ... MyThread_j9 as extra module-level names for the same
# worker class. globals() is used because assigning through locals() is only
# reliable when this code happens to run at module scope.
for i in range(5, 10):
    globals()['MyThread_j' + str(i)] = MyThread_j

# 创建多线程

#爬取评论

class MyThread(threading.Thread):
    """Worker thread that saves comment texts and collects comment photo URLs.

    Each instance takes a single URL from the queue, downloads the JSONP
    comment page, appends every comment's text to ``data.txt`` and adds the
    URLs of the attached images to the module-level ``zhao_pian`` list.
    """

    # Request headers: a desktop Chrome User-Agent string.
    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    # Seconds to wait before a stalled request is abandoned.
    _REQUEST_TIMEOUT = 10

    def __init__(self, q):
        """Remember ``q``, the queue from which ``get_index`` takes its URL."""
        threading.Thread.__init__(self)
        self.q = q

    def run(self) -> None:
        """Thread entry point: process exactly one queued URL."""
        self.get_index()

    @staticmethod
    def _parse_jsonp(text):
        """Decode a JSON document that may be wrapped as ``callback(...);``.

        ``str.lstrip``/``str.rstrip`` remove *sets of characters*, not a
        prefix or suffix, so the wrapper is cut off by position instead.

        Raises:
            ValueError: if what remains is not valid JSON (for example an
                empty response body).
        """
        body = text.strip()
        if body[:1] not in ('{', '['):
            opening = body.find('(')
            closing = body.rfind(')')
            if opening != -1 and closing > opening:
                body = body[opening + 1:closing]
        return json.loads(body)

    @staticmethod
    def _photo_urls(comment):
        """Return an ``https://`` URL for every image attached to ``comment``.

        Comments without an ``images`` entry yield an empty list; image
        records without an ``imgUrl`` are skipped.
        """
        urls = []
        for image in comment.get('images') or []:
            raw = image.get('imgUrl')
            if raw:
                # Drop whatever scheme precedes '//' and force https.
                urls.append('https://' + raw.split('//')[-1])
        return urls

    def get_index(self):
        """Download one comment page; store its texts and photo URLs."""
        url = self.q.get()
        try:
            res = requests.get(url, headers=self._HEADERS,
                               timeout=self._REQUEST_TIMEOUT)
            jd = self._parse_jsonp(res.text)
        except (requests.RequestException, ValueError) as err:
            # A failed or unparsable page is reported and skipped.
            print("请求失败，已跳过: %s (%s)" % (url, err))
            return

        comments = jd.get('comments') if isinstance(jd, dict) else None
        with open("data.txt", "a", encoding="utf-8") as file:
            for item in comments or []:
                file.write(item['content'] + '\n')
                # Photos are gathered for every comment, not only the last
                # one seen by the loop.
                for photo in self._photo_urls(item):
                    print(photo)
                    zhao_pian.append(photo)

# Publish MyThread_0 ... MyThread_4 as extra module-level names for the same
# worker class. globals() is used because assigning through locals() is only
# reliable when this code happens to run at module scope.
for i in range(5):
    globals()['MyThread_' + str(i)] = MyThread

def main(url_d):
    """Crawl the comments of one JD product, save its photos and plot a trend.

    Steps, in order:
      1. request comment pages 0-24 with ``MyThread_j`` (comment months are
         appended to ``data2.txt``);
      2. request the comment batches listed below with ``MyThread`` (texts go
         to ``data.txt``, photo URLs to ``zhao_pian``);
      3. download every collected photo as ``1.jpg``, ``2.jpg``, ...;
      4. count the months found in ``data2.txt`` and draw a line chart that
         is saved as ``Figure_1.png`` and then shown.

    Args:
        url_d: Product page URL. The product id is the part of the last
            path segment before the first dot, e.g. ``.../100012.html``
            gives ``100012``.
    """
    product_id = url_d.split('/')[-1].split('.')[0]
    q = queue.Queue()

    time_url = ("https://club.jd.com/comment/productPageComments.action"
                "?callback=fetchJSON_comment98&productId={pid}"
                "&score=1&sortType=6&page={page}"
                "&pageSize=10&isShadowSku=0&fold=1")
    comment_url = ("https://club.jd.com/comment/productPageComments.action"
                   "?callback=jQuery593159&productId={pid}"
                   "&score={score}&sortType=5&page={page}"
                   "&pageSize=10&pin=null&_=1616905936767")
    # (score, pages) for each batch of comment requests, in request order.
    comment_batches = (
        (2, range(5)),
        (1, range(10)),
        (2, range(10, 15)),
        (3, range(15, 20)),
        (3, range(20, 25)),
    )

    for page in range(25):
        _run_worker(MyThread_j, q, time_url.format(pid=product_id, page=page))

    for score, pages in comment_batches:
        for page in pages:
            _run_worker(MyThread, q, comment_url.format(
                pid=product_id, score=score, page=page))

    _save_photos(zhao_pian)

    if not _load_month_counts("data2.txt"):
        print("没有可用的评论时间数据，跳过绘图")
        return
    _plot_trend(dict_data2, dict_data3)


def _run_worker(worker_cls, q, url):
    """Queue ``url`` and run one ``worker_cls`` thread until it finishes."""
    q.put(url)
    # Pages are fetched one at a time: each worker is joined before the next
    # URL is queued.
    worker = worker_cls(q)
    worker.start()
    worker.join()


def _save_photos(urls):
    """Download each URL in ``urls`` to ``<position>.jpg`` (1-based)."""
    for number, url in enumerate(urls, start=1):
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
        except requests.RequestException as err:
            # Do not store an error page under a .jpg name.
            print("下载图片失败，已跳过: %s (%s)" % (url, err))
            continue
        with open(str(number) + '.jpg', 'wb') as f:
            f.write(r.content)


def _load_month_counts(path):
    """Fill ``dict_data``/``dict_data2``/``dict_data3`` from the file ``path``.

    ``dict_data`` receives the sorted integers read from the file,
    ``dict_data2`` the distinct values as strings and ``dict_data3`` their
    counts. The lists are emptied first so repeated calls do not accumulate.

    Returns:
        True when at least one value was read, False when the file is
        missing or holds no values.
    """
    del dict_data[:], dict_data2[:], dict_data3[:]
    try:
        # The file is written as UTF-8, so read it back the same way.
        with open(path, encoding="utf-8") as file:
            for line in file:
                line = line.strip()
                if line:
                    dict_data.append(int(line))
    except FileNotFoundError:
        return False

    dict_data.sort()
    # Counter keeps first-seen order, which is ascending after the sort.
    for month, count in collections.Counter(dict_data).items():
        dict_data2.append(str(month))
        dict_data3.append(count)
    return bool(dict_data)


def _plot_trend(labels, counts):
    """Draw ``counts`` against ``labels``, save Figure_1.png and show it."""
    x = np.array(labels)
    y = np.array(counts)

    fig = plt.figure(figsize=(25, 10))
    fig.patch.set_facecolor('Turquoise')
    ax = fig.add_subplot(111)
    ax.patch.set_facecolor('DeepSkyBlue')

    # One red line of width 2 with a marker on every data point.
    plt.plot(x, y, 'red', lw=2)
    plt.scatter(x, y, color='red', marker='o')

    # The titles contain Chinese text, so an explicit font file is used.
    font = FontProperties(fname="mnjzbh.ttf")
    plt.title("销售趋势(按时间顺序)", fontsize=24, fontproperties=font)
    plt.xlabel("时间(年，月)", fontsize=14, fontproperties=font)
    plt.ylabel("销售数量(件)", fontsize=14, fontproperties=font)

    plt.savefig('Figure_1.png', bbox_inches='tight')
    plt.show()

if __name__ == '__main__':
    # Script entry point: ask for the product page URL, run the crawl and
    # report how many whole seconds it took.
    url_d = input('输入网址:')

    start_time = time.time()
    main(url_d)
    end_time = time.time()

    print("耗时%d" % (end_time - start_time))

复制代码

小伤口 · 发表于 2021-8-31 23:48:14

arrexxxx 发表于 2021-8-31 21:20
请问这里运行不了该怎么解决求指教
Exception in thread Thread-11:
Traceback (most recent ...

你现在再运行一下试试

小伤口 · 发表于 2021-9-1 12:25:14

arrexxxx 发表于 2021-9-1 11:33
打印图片之后是这样子的报错T T

这是正常现象：因为这个方法好像要被弃用了，所以会显示这个提示。

账号		自动登录	找回密码
密码			立即注册

[已解决]xdm，爬虫求助

浏览过的版块