|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
想爬取的时猫眼电影的榜单,应该是有100条数据的,为什么只能爬取75条数据,哪位大佬帮我看一下。
import requests
import threading
from lxml import etree
import csv
from queue import Queue
class Product(threading.Thread):
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
def __init__(self, lock, page_que, html_que, *args, **kwargs):
super(Product, self).__init__(*args, *kwargs)
self.page_que = page_que
self.html_que = html_que
self.lock=lock
def run(self) -> None:
while 1:
if self.page_que.empty():
break
a = self.page_que.get()
print('第%d页正在下载' % (a + 1))
url = 'https://maoyan.com/board/4?offset=' + str(a) + '0'
e = requests.get(url, headers=self.header).text
html = etree.HTML(e)
name = html.xpath("//div[@class='main']//dd/a/@title")
actor = html.xpath("//div[@class='main']//p[@class='star']")
star = []
for i in actor:
c = i.xpath('string(.)')
c = str(c)
c = c.strip()
star.append(c)
t = html.xpath("//div[@class='main']//p[@class='releasetime']")
time = []
for i in t:
c = i.xpath('string(.)')
c = str(c).strip()
time.append(c)
sour = html.xpath("//div[@class='main']//i[@class='integer']")
fsre = []
for i in sour:
c = i.xpath('string(.)')
c = str(c).strip()
fsre.append(c)
qq = html.xpath("//div[@class='main']//i[@class='fraction']")
lsre = []
for i in qq:
c = i.xpath('string(.)')
c = str(c).strip()
lsre.append(c)
source = []
for i in range(10):
r = fsre[i] + lsre[i]
source.append(r)
self.lock.acquire(blocking=0)
for i in range(10):
self.html_que.put((name[i], star[i], time[i], source[i]))
self.lock.release()
class Consumer(threading.Thread):
def __init__(self, lock, html_que, writer, *args, **kwargs):
super(Consumer, self).__init__(*args, **kwargs)
self.lock = lock
self.html_que = html_que
self.writer = writer
def run(self) -> None:
while 1:
try:
self.lock.acquire(blocking=0)
a = self.html_que.get()
name, actor, time, score = a
self.writer.writerow((name, actor, time, score))
print("保存一条")
self.lock.release
except:
break
def main():
lock = threading.Lock()
page_que = Queue(50)
html_que = Queue(5000)
f = open('1.csv', 'a', newline='', encoding='utf-8')
writer = csv.writer(f)
writer.writerow(('name', 'actor', 'time', 'score'))
for i in range(10):
page_que.put(i)
t = Product(lock, page_que, html_que)
t.start()
for i in range(3):
t = Consumer(lock, html_que, writer)
t.start()
if __name__ == '__main__':
main()
|
|