多线程爬取猫眼电影放入.csv文件中,Python交流,编程语言专区,鱼C论坛

maday 发表于 2020-6-10 22:07:04

多线程爬取猫眼电影放入.csv文件中

想爬取的时猫眼电影的榜单，应该是有100条数据的，为什么只能爬取75条数据，哪位大佬帮我看一下。{:10_277:}
import requests
import threading
from lxml import etree
import csv
from queue import Queue

class Product(threading.Thread):
header = {
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
def __init__(self, lock, page_que, html_que, *args, **kwargs):
   super(Product, self).__init__(*args, *kwargs)
   self.page_que = page_que
   self.html_que = html_que
   self.lock=lock

def run(self) -> None:

   while 1:
         if self.page_que.empty():
            break
         a = self.page_que.get()
         print('第%d页正在下载' % (a + 1))
         url = 'https://maoyan.com/board/4?offset=' + str(a) + '0'
         e = requests.get(url, headers=self.header).text
         html = etree.HTML(e)
         name = html.xpath("//div[@class='main']//dd/a/@title")
         actor = html.xpath("//div[@class='main']//p[@class='star']")
         star = []
         for i in actor:
            c = i.xpath('string(.)')
            c = str(c)
            c = c.strip()
            star.append(c)
         t = html.xpath("//div[@class='main']//p[@class='releasetime']")
         time = []
         for i in t:
            c = i.xpath('string(.)')
            c = str(c).strip()
            time.append(c)
         sour = html.xpath("//div[@class='main']//i[@class='integer']")
         fsre = []
         for i in sour:
            c = i.xpath('string(.)')
            c = str(c).strip()
            fsre.append(c)
         qq = html.xpath("//div[@class='main']//i[@class='fraction']")
         lsre = []
         for i in qq:
            c = i.xpath('string(.)')
            c = str(c).strip()
            lsre.append(c)
         source = []
         for i in range(10):
            r = fsre + lsre
            source.append(r)

         self.lock.acquire(blocking=0)
         for i in range(10):
            self.html_que.put((name, star, time, source))
         self.lock.release()

class Consumer(threading.Thread):
def __init__(self, lock, html_que, writer, *args, **kwargs):
   super(Consumer, self).__init__(*args, **kwargs)
   self.lock = lock
   self.html_que = html_que
   self.writer = writer

def run(self) -> None:
   while 1:
         try:
            self.lock.acquire(blocking=0)
            a = self.html_que.get()
            name, actor, time, score = a

            self.writer.writerow((name, actor, time, score))
            print("保存一条")
            self.lock.release
         except:
            break

def main():
lock = threading.Lock()
page_que = Queue(50)
html_que = Queue(5000)

f = open('1.csv', 'a', newline='', encoding='utf-8')
writer = csv.writer(f)
writer.writerow(('name', 'actor', 'time', 'score'))

for i in range(10):
   page_que.put(i)

t = Product(lock, page_que, html_que)
t.start()

for i in range(3):
   t = Consumer(lock, html_que, writer)
   t.start()

if __name__ == '__main__':
main()

maday 发表于 2020-6-10 23:01:30

{:10_266:}

maday 发表于 2020-6-11 08:21:10

救救我

xmpython 发表于 2020-6-11 08:57:08

多线程这么快不会被反爬吗{:10_257:}

maday 发表于 2020-6-11 10:02:57

xmpython 发表于 2020-6-11 08:57
多线程这么快不会被反爬吗

啊，是这个原因吗，反正怎么爬就只能获取前75行数据

页: [1]

鱼C论坛's Archiver

多线程爬取猫眼电影放入.csv文件中