|
发表于 2022-4-14 19:43:21
|
显示全部楼层
- # 导入支持库
- from urllib.request import urlopen, Request
- from bs4 import BeautifulSoup
- from lxml import etree # 需要从 lxml 模块 导入 etree
- import re
- import requests # 需要导入 requests 模块
- import pymysql
- import time
# HTTP headers sent with every page request (browser-like User-Agent).
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
headers = {"User-Agent": _USER_AGENT}

# Database connection settings, kept together so they are easy to review.
_DB_SETTINGS = {
    "host": 'localhost',
    "port": 3306,
    "user": 'root',
    "password": 'secureworks',
    "db": 'test_spider',
    "charset": 'utf8',
}

# Open the connection once at import time; the functions below share it.
connect = pymysql.Connect(**_DB_SETTINGS)

# Cursor used for every INSERT issued by the scraper.
cursor = connect.cursor()

# Parameterized INSERT statement: the eight %s placeholders are filled in by
# cursor.execute(), one value per column, in the order listed here.
sql_in = '''insert into doubanmovie (name,director,actor,style,country,release_time,time,score) value (%s,%s,%s,%s,%s,%s,%s,%s)'''
def get_movie_url(url, timeout=10):
    """Fetch a listing page and process every movie detail link found on it.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    selector = etree.HTML(response.text)
    # Detail-page links are the href of <a> elements directly inside
    # <div class="hd">.
    for movie_href in selector.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(movie_href)
def _first(matches, default=''):
    """Return the first element of *matches*, or *default* when it is empty."""
    return matches[0] if matches else default


def get_movie_info(url, timeout=10):
    """Download one movie detail page and insert its fields into the database.

    Each field is extracted independently; a field that cannot be found on
    the page is stored as an empty string. The row is inserted through the
    module-level ``cursor`` using ``sql_in``; this function does not commit.

    Args:
        url: Address of the movie detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    page = response.text
    selector = etree.HTML(page)

    name = _first(selector.xpath('//*[@id="content"]/h1/span[1]/text()'))
    director = _first(
        selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
    )

    # The cast list is the concatenated text of the whole matching element,
    # not just its first text node.
    actor_nodes = selector.xpath('//*[@class="actor"]/span[2]')
    actor = actor_nodes[0].xpath('string(.)') if actor_nodes else ''

    style = _first(
        re.findall(r'<span property="v:genre">(.*?)</span>', page, re.S)
    )
    country = _first(
        re.findall(
            r'<span class="pl">制片国家/地区:</span> (.*?)<br/>', page, re.S
        )
    )
    release_time = _first(
        re.findall(
            r'<span property="v:initialReleaseDate" content=.*?>(.*?)</span>',
            page,
            re.S,
        )
    )
    # Named `runtime` rather than `time` so the imported `time` module is not
    # shadowed inside this function.
    runtime = _first(
        re.findall(
            r'<span property="v:runtime" content=.*?>(.*?)</span>',
            page,
            re.S,
        )
    )
    score = _first(
        selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
    )

    # Values are passed as parameters (never interpolated into the SQL text),
    # one per %s placeholder and in the column order of sql_in.
    cursor.execute(
        sql_in,
        [
            str(name),
            str(director),
            str(actor),
            str(style),
            str(country),
            str(release_time),
            str(runtime),
            str(score),
        ],
    )
# NOTE: the original post marked this section as having had an indentation
# error; the structure below (loop body = fetch + pause, completion message
# after the loop) is the corrected layout.
urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0, 250, 25)]
try:
    for url in urls:
        get_movie_url(url)
        # Commit after every listing page so that a failure on a later page
        # does not discard the rows already inserted.
        connect.commit()
        # Pause between listing pages to throttle the request rate.
        time.sleep(5)
    print('我好了!')
finally:
    # Always release the database resources, even if the crawl aborted.
    cursor.close()
    connect.close()
复制代码 |
|