|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
最近学了一点爬虫,但是不知道为什么这个程序一直报错。
- import urllib.request
- from lxml import etree
def create_response(page):
    """Build the HTTP request for one listing page of the gallery.

    Despite its name, this returns a ``urllib.request.Request`` (the
    object to be opened), not a response. The name is kept so existing
    callers continue to work.

    Args:
        page: Page number. Page 1 uses a dedicated URL with no numeric
            suffix; every other value is interpolated into the paged URL.

    Returns:
        A ``urllib.request.Request`` for the page URL that carries a
        desktop-Chrome ``User-Agent`` header.
    """
    if page == 1:
        url = 'https://sc.chinaz.com/tu/meinv.html'
    else:
        # "%s" formatting already stringifies ``page``; the original's bare
        # ``str(page)`` statement discarded its result and has been removed.
        url = "https://sc.chinaz.com/tu/meinv-%s-0-0.html" % page
    headers = {
        # Presumably needed so the site does not reject urllib's default
        # agent string -- not verifiable from this file.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    return urllib.request.Request(url, headers=headers)
def get_info(response):
    """Open a request and return the body decoded as UTF-8 text.

    Args:
        response: Whatever ``urllib.request.urlopen`` accepts (a URL string
            or a ``Request``). In this script it is the ``Request`` built
            by ``create_response``; the parameter name is historical.

    Returns:
        The full response body as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read()/decode()
    # raises; the original never closed the object returned by urlopen().
    with urllib.request.urlopen(response) as reply:
        return reply.read().decode('utf-8')
def down_img(content):
    """Print the URL of every image found in a gallery listing page.

    The page is parsed with lxml; for each ``<img>`` below
    ``<div id="ulcontent">`` the ``alt`` text (used as the image name) and
    the ``data-src`` attribute (the image location) are collected.

    Args:
        content: HTML source of one listing page, as text.

    Returns:
        None. One URL is printed per image, followed by ``Over``.
    """
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@id="ulcontent"]//div//img/@alt')
    img_list = tree.xpath('//div[@id="ulcontent"]//div//img/@data-src')
    # zip() visits every (name, src) pair. The original looped over
    # range(len(nameList) - 1), which silently skipped the last image, and
    # indexed imgList directly, which could raise IndexError if the two
    # lists differed in length.
    for name, src in zip(name_list, img_list):
        # 'https:' is prepended as in the original; presumably data-src is
        # protocol-relative ("//host/path") -- confirm against the page.
        img = 'https:' + src
        print(img)
        # Saving is left disabled here, as in the original:
        # urllib.request.urlretrieve(img, name + '.jpg')
    print("Over")
if __name__ == '__main__':
    # Ask for an inclusive page range, then for each page build the request,
    # fetch the HTML and hand it to the image lister.
    first_page = int(input("请输入开始页:"))
    last_page = int(input("请输入结束页:"))
    for page_no in range(first_page, last_page + 1):
        page_html = get_info(create_response(page_no))
        down_img(page_html)
复制代码
注意:你抓取到的图片网址(data-src)中含有反斜杠 \,应当全部替换为正斜杠 /
- import urllib.request
- from lxml import etree
def create_response(page):
    """Build the HTTP request for one listing page of the gallery.

    Despite its name, this returns a ``urllib.request.Request`` (the
    object to be opened), not a response. The name is kept so existing
    callers continue to work.

    Args:
        page: Page number. Page 1 uses a dedicated URL with no numeric
            suffix; every other value is interpolated into the paged URL.

    Returns:
        A ``urllib.request.Request`` for the page URL that carries a
        desktop-Chrome ``User-Agent`` header.
    """
    if page == 1:
        url = 'https://sc.chinaz.com/tu/meinv.html'
    else:
        # "%s" formatting already stringifies ``page``; the original's bare
        # ``str(page)`` statement discarded its result and has been removed.
        url = "https://sc.chinaz.com/tu/meinv-%s-0-0.html" % page
    headers = {
        # Presumably needed so the site does not reject urllib's default
        # agent string -- not verifiable from this file.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    return urllib.request.Request(url, headers=headers)
def get_info(response):
    """Open a request and return the body decoded as UTF-8 text.

    Args:
        response: Whatever ``urllib.request.urlopen`` accepts (a URL string
            or a ``Request``). In this script it is the ``Request`` built
            by ``create_response``; the parameter name is historical.

    Returns:
        The full response body as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read()/decode()
    # raises; the original never closed the object returned by urlopen().
    with urllib.request.urlopen(response) as reply:
        return reply.read().decode('utf-8')
def down_img(content):
    """Download every image found in a gallery listing page.

    The page is parsed with lxml; for each ``<img>`` below
    ``<div id="ulcontent">`` the ``alt`` text (used as the file name) and
    the ``data-src`` attribute (the image location) are collected. Each
    image is saved as ``<alt>.jpg`` relative to the current working
    directory.

    Args:
        content: HTML source of one listing page, as text.

    Returns:
        None. Each URL is printed before it is downloaded, and ``Over`` is
        printed once all images on the page are done.

    Raises:
        urllib.error.URLError: If an image cannot be fetched.
    """
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@id="ulcontent"]//div//img/@alt')
    img_list = tree.xpath('//div[@id="ulcontent"]//div//img/@data-src')
    # zip() visits every (name, src) pair. The original looped over
    # range(len(nameList) - 1), which silently skipped the last image, and
    # indexed imgList directly, which could raise IndexError if the two
    # lists differed in length.
    for name, src in zip(name_list, img_list):
        # 'https:' is prepended as in the original; presumably data-src is
        # protocol-relative ("//host/path") -- confirm against the page.
        img = 'https:' + src
        print(img)
        # Note: the scraped address contains backslashes, which must all be
        # replaced with forward slashes. The backslash has to be written
        # escaped ("\\"); the original "\" was an unterminated string.
        # NOTE(review): ``name`` comes straight from the page and is used as
        # a file name unsanitised -- consider stripping path separators.
        urllib.request.urlretrieve(img.replace("\\", "/"), name + '.jpg')
    print("Over")
if __name__ == '__main__':
    # Ask for an inclusive page range, then for each page build the request,
    # fetch the HTML and hand it to the image downloader.
    first_page = int(input("请输入开始页:"))
    last_page = int(input("请输入结束页:"))
    for page_no in range(first_page, last_page + 1):
        page_html = get_info(create_response(page_no))
        down_img(page_html)
复制代码
|
|