|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 slhlde 于 2019-12-19 23:29 编辑
import requests
import re
from lxml import etree
import time
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
def get_movie_info(url):
html=requests.get(url,headers=headers)
print('======打印网页状态,确认是否被反爬虫=======')
print(html.status_code)
selector=etree.HTML(html.text)
infos=selector.xpath('//div[@class="info"]')
print('======打印所有infos======')
print(infos)
for info in infos:
names=info.xpath('//div[@class="hd"]/a/span/text()')[0]
directors_actors=info.xpath('//div[@class="bd"]/p[1]/text()')[0].replace(" ", "").replace("\n", "")
dates=info.xpath('//div[@class="bd"]/p[1]/text()')[1].replace(" ", "").replace("\n", "").split("/")[0]
countrys=info.xpath('//div[@class="bd"]/p[1]/text()')[1].replace(" ", "").replace("\n", "").split("/")[1]
stypes=info.xpath('//div[@class="bd"]/p[1]/text()')[1].replace(" ", "").replace("\n", "").split("/")[2]
scores=info.xpath('//span[@class="rating_num"]/text()')[0]
comCounts=info.xpath('//div[@class="star"]/span[4]/text()')[0]
data = {
'name': names,
'directors_actor': directors_actors,
'date': dates,
'stype': stypes,
'country': countrys,
'score': scores,
'comCount':comCounts
}
print(data)
if __name__=="__main__":
urls=['https://movie.douban.com/top250?start={}&filter='.format(str(i)) for i in range(0,250,25)]
for url in urls:
print('==========打印当前的地址========')
print(url)
get_movie_info(url)
time.sleep(2)
本帖最后由 yjsx86 于 2019-12-20 21:08 编辑
xpath语法有问题 // 改为 .//
然后print(data)缩进有问题
- import requests
- import re
- from lxml import etree
- import time
- headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
- def get_movie_info(url):
- html=requests.get(url,headers=headers)
- print('======打印网页状态,确认是否被反爬虫=======')
- print(html.status_code)
- selector=etree.HTML(html.text)
- infos=selector.xpath('//div[@class="info"]')
- print('======打印所有infos======')
- print(infos)
- for info in infos:
- names=info.xpath('.//div[@class="hd"]/a/span/text()')[0]
- directors_actors=info.xpath('.//div[@class="bd"]/p[1]/text()')[0].replace(" ", "").replace("\n", "")
- dates=info.xpath('.//div[@class="bd"]/p[1]/text()')[1].replace(" ", "").replace("\n", "").split("/")[0]
- countrys=info.xpath('.//div[@class="bd"]/p[1]/text()')[1].replace(" ", "").replace("\n", "").split("/")[1]
- stypes=info.xpath('.//div[@class="bd"]/p[1]/text()')[1].replace(" ", "").replace("\n", "").split("/")[2]
- scores=info.xpath('.//span[@class="rating_num"]/text()')[0]
- comCounts=info.xpath('.//div[@class="star"]/span[4]/text()')[0]
- data = {
- 'name': names,
- 'directors_actor': directors_actors,
- 'date': dates,
- 'stype': stypes,
- 'country': countrys,
- 'score': scores,
- 'comCount':comCounts
- }
- print(data)
- if __name__=="__main__":
- urls=['https://movie.douban.com/top250?start={}&filter='.format(str(i)) for i in range(0,250,25)]
- for url in urls:
- print('==========打印当前的地址========')
- print(url)
- get_movie_info(url)
- time.sleep(2)
复制代码
|
|