因为每一页的结果没有累加到总列表里,所以 len 一直是 25;改成累加方式:

import requests
import re
def main():
    """Scrape the Douban Top 250 listing with regexes and print the results.

    Requests the ten listing pages (``start=0, 25, ..., 225``), extracts
    title, link and rating strings from each page's HTML, accumulates them
    across pages into three lists, and prints the lists once after the last
    page has been processed.

    Raises:
        requests.RequestException: if a request fails, times out, or the
            server answers with an HTTP error status.
    """
    base_url = 'https://movie.douban.com/top250?'
    headers = {'user-agent': 'firefox'}

    # Compiled once; the same three patterns are applied to every page.
    name_re = re.compile(r'<span class="title">(.*?)</span>')
    href_re = re.compile(r'<a href="(.*?)" class="">')
    star_re = re.compile(r'property="v:average">(.*?)</span>')

    # NOTE(review): the three lists are filled independently, so index i
    # refers to the same movie in all of them only if every pattern matches
    # exactly once per entry -- confirm against the live markup.
    m_name = []
    m_href = []
    m_star = []

    for x in range(0, 250, 25):
        url = base_url + f'start={x}'
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on an error status instead of parsing an error page
        # and silently collecting nothing.
        r.raise_for_status()

        # Title spans whose text contains "nbsp" are dropped (presumably the
        # alternate-title spans -- verify against the page source).
        m_name.extend(
            title for title in name_re.findall(r.text) if 'nbsp' not in title
        )
        m_href.extend(href_re.findall(r.text))
        m_star.extend(star_re.findall(r.text))

    print(m_name, m_href, m_star)


if __name__ == '__main__':
    main()
另外,爬取数据还是用 urllib/requests + xpath 更合适;selenium 适合需要 UI 操作的场景。

import requests
from lxml import etree
def main():
    """Scrape the Douban Top 250 listing with XPath and print one row per movie.

    Requests the ten listing pages (``start=0, 25, ..., 225``), parses each
    page with lxml, and collects ``[name, rank, score, comment]`` for every
    ``<li>`` under ``<ol class="grid_view">``. After all pages are fetched,
    each row is printed followed by a line of 100 ``=`` characters.

    Raises:
        requests.RequestException: if a request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: if a list item lacks the name, rank or score node that
            the XPath expressions expect.
    """
    base_url = 'https://movie.douban.com/top250?'
    headers = {'user-agent': 'firefox'}
    movies = []

    for x in range(0, 250, 25):
        url = base_url + f'start={x}'
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on an error status instead of parsing an error page
        # and silently collecting nothing.
        r.raise_for_status()

        html = etree.HTML(r.text)
        for li in html.xpath('//ol[@class="grid_view"]/li'):
            m_name = li.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]
            m_rank = li.xpath('.//div[@class="pic"]/em/text()')[0]
            m_score = li.xpath('.//div[@class="star"]/span[2]/text()')[0]
            # xpath() returns a list; it is empty when the entry has no
            # comment, in which case a placeholder string is stored instead.
            m_comment = li.xpath('./div/div[2]/div[2]/p[2]/span/text()')
            m_comment = m_comment[0] if m_comment else '没有评价'
            movies.append([m_name, m_rank, m_score, m_comment])

    for i in movies:
        print(i)
        print('=' * 100)


if __name__ == '__main__':
    main()
|